Issue
I am trying to record, for each instance the loop gathers data for, a number identifying that iteration, so I can look it up later.
The issue I am having is that it either assigns one single number to the column for ALL the looped instances, or it fails and tells me:
ValueError: Length of values (1) does not match length of index (2)
All I want to do is add a number to a column so I know when that instance was pulled through the code.
Please help, I have been banging my head against the wall here.
Thanks y'all
Here is my code:
# NOTE(review): this is the original (buggy) script quoted in the question;
# indentation restored for readability, logic untouched.
states = ["Washington", "Oregon"]
period = "2020"
num_states = len(states)
state_list = []
i = 0
df = pd.DataFrame()
# BUG: `state` is not defined yet (the for-loop below introduces it), so this
# line raises NameError before the scrape ever starts.
df[state] = i
for state in states:
    x = state
    driver = webdriver.Chrome(executable_path = 'C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    # Fill in the search form: state and fiscal-year drop-downs.
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs = pd.read_html(addrss)[2]
    # Get the html
    soup = BeautifulSoup(page.text, 'lxml')
    state_build = url.split('code=',1)[1]
    state_id = state_build[:2]
    table = soup.findAll('table')[2]
    headers = []
    # BUG: this loop reuses `i` as its loop variable, clobbering the outer
    # "which iteration is this" counter with a bs4 Tag object.
    for i in table.find_all('th'):
        title = i.text.strip()
        headers.append(title)
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        length = len(df)
        df = df.append(row_data)  # NOTE(review): DataFrame.append is deprecated (removed in pandas 2.0)
    dfs = df.set_index(df.groupby(level = 0)\
        .cumcount(), append = True).stack()\
        .unstack(0)\
        .rename(columns={0 : 'COUNTY', 1: 'PRICE', 2: "TOTAL ACRES"})
    # BUG: assigns one scalar to the whole column — every row gets the same
    # value; presumably this is what produces the reported behavior/ValueError.
    dfs['STATE'] = i
    time.sleep(5)
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.findAll('table')[1]
    headers = []
    i = i + 1
dfs
Solution
That's precisely what I use enumerate
for. Going off of QHarr's code (so accept his solution — I'm just adding to it), you can see the slight difference: there's no need to set r = 0
and then increment it with r += 1
, because enumerate supplies the counter for you.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
# Scrape the PILT county-payment table for each state, tagging every scraped
# row with the loop index so the iteration it came from can be looked up later.
states = ["Washington", "Oregon"]
period = "2020"
num_states = len(states)
state_list = []
df = pd.DataFrame()
for r, state in enumerate(states): #<- r will be the index position from the list `states` as it iterates through
    x = state
    driver = webdriver.Chrome()#(executable_path = 'C:/webdrivers/chromedriver.exe')
    driver.get('https://www.nbc.gov/pilt/counties.cfm')
    driver.implicitly_wait(20)
    # Fill in the search form: state and fiscal-year drop-downs.
    state_s = driver.find_element(By.NAME, 'state_code')
    drp = Select(state_s)
    drp.select_by_visible_text(state)
    year_s = driver.find_element(By.NAME, 'fiscal_yr')
    drp = Select(year_s)
    drp.select_by_visible_text(period)
    driver.implicitly_wait(10)
    link = driver.find_element(By.NAME, 'Search')
    link.click()
    url = driver.current_url
    page = requests.get(url)
    #dfs = pd.read_html(addrss)[2]
    # Get the html
    soup = BeautifulSoup(page.text, 'lxml')
    state_build = url.split('code=',1)[1]
    state_id = state_build[:2]
    table = soup.findAll('table')[2]
    headers = []
    for i in table.find_all('th'): # you use i here!
        title = i.text.strip()
        headers.append(title)
    for row in table.find_all('tr')[1:]:
        data = row.find_all('td')
        row_data = [td.text.strip() for td in data]
        # Append the enumerate counter to each row so it survives into the
        # reshaped frame as the "LOOP" column.
        row_data.append(r)
        length = len(df)
        df = df.append(row_data)  # NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat is the modern spelling
    dfs = df.set_index(df.groupby(level = 0)\
        .cumcount(), append = True).stack()\
        .unstack(0)\
        .rename(columns={0 : 'COUNTY', 1: 'PRICE', 2: "TOTAL ACRES", 3:"LOOP"})
    time.sleep(5)
    soup = BeautifulSoup(page.text, 'lxml')
    table = soup.findAll('table')[1]
    headers = []
dfs
HOWEVER...any reason you're not using pandas here and just formatting the url?
import pandas as pd

# Pull the last table on each state's PILT results page directly with pandas,
# then stack the per-state frames into one DataFrame tagged by state.
states = ["WA", "OR"]
period = "2020"
url_template = 'https://www.nbc.gov/pilt/counties.cfm?term=county&state_code={state}&fiscal_yr={period}'

frames = []
for abbrev in states:
    # The county-payment table is the final table on the page.
    page_tables = pd.read_html(url_template.format(state=abbrev, period=period))
    state_frame = page_tables[-1]
    state_frame['State'] = abbrev
    frames.append(state_frame)

df = pd.concat(frames).reset_index(drop=True)
Output:
print(df)
COUNTY PAYMENT PAYMENT.1 PAYMENT.2 TOTAL ACRES State
0 ADAMS COUNTY $59,408 $59,408 $59,408 21337 WA
1 ASOTIN COUNTY $174,550 $174,550 $174,550 71580 WA
2 BENTON COUNTY $181,659 $181,659 $181,659 64264 WA
3 CHELAN COUNTY $3,244,827 $3,244,827 $3,244,827 1486918 WA
4 CLALLAM COUNTY $1,101,485 $1,101,485 $1,101,485 523298 WA
.. ... ... ... ... ... ...
71 WASCO COUNTY $87,973 $87,973 $87,973 220099 OR
72 WASHINGTON COUNTY $39,545 $39,545 $39,545 13984 OR
73 WHEELER COUNTY $120,613 $120,613 $120,613 301762 OR
74 YAMHILL COUNTY $38,627 $38,627 $38,627 58311 OR
75 TOTAL $23,321,995 $23,321,995 $23,321,995 31312205 OR
Answered By - chitown88
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.