Issue
I wrote a script that scrapes data from a site with school information. The 'search' page being scraped has school districts in a drop-down box. The script loops through all districts and extracts data from the page of each district, which has multiple schools, with some districts having schools across several pages.
I would like to add all of the schools to a DataFrame and then write the data to a file.
However, I get the following error: `ValueError: Length mismatch: Expected axis has 87 elements, new values have 2 elements`.
Here is the code.
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
# Question's script, reproduced as posted (leading indentation was lost when
# the page was extracted; the loop body below is therefore shown unindented).
# Fetch the district search page and read the options of its district <select>.
districts = "https://web36.gov.mb.ca/school/school?action=district"
soup = BeautifulSoup(requests.get(districts).content, "html.parser")
options = soup.find("select",{"name":"DivisionSelection"}).findAll("option")
# Skip the first <option>; keep the display text and the 'value' of the rest.
name = [option.get_text(strip=True) for option in options[1:]]
id_ = [option.get('value') for option in options[1:]]
# Remove two ids by position (index 1, then index 4 of the already-shortened
# list). The post does not say why these entries are excluded.
id_.pop(1)
id_.pop(4)
mylist = []
# For each remaining district id: load the district page, read the numbers in
# its 'n_schools' div, then request the page again with High/Low set
# (presumably so a single response lists every school -- verify on the site).
for i in id_:
search_page = f"https://web36.gov.mb.ca/school/school?action=district&DivisionSelection={i}"
request = requests.get(search_page)
soup = BeautifulSoup(request.text, "lxml")
n_schools = soup.findAll('div', attrs = {'class':'n_schools'})
# All digit runs found in the stringified markup; the third one is used as High.
n_schools= re.findall("[0-9]+",str(n_schools))
High=n_schools[2]
schools = f"https://web36.gov.mb.ca/school/school?action=district&High={High}&Low=1&DivisionSelection={i}"
request = requests.get(schools)
soup = BeautifulSoup(request.text, "lxml")
data = soup.findAll('div', attrs = {'class':'sc_address'})
# One list per 'sc_address' div, holding the text of each nested <div> with
# its text pieces joined by the literal string "<br>".
data = [[x.get_text(separator="<br>", strip=True) for x in y.findAll('div')] for y in data]
# NOTE(review): append() stores the whole district as ONE element of mylist,
# so each DataFrame row below is a district with one column per school. That
# is consistent with the reported "Expected axis has 87 elements" error;
# extend() would give one row per school instead.
mylist.append(data)
# NOTE(review): [1:] discards the first element of mylist (the first district
# collected) -- this looks unintended.
df = pd.DataFrame(mylist[1:])
headerName=['Name', 'Info']
# Assigning two labels fails with the ValueError from the question whenever
# the frame does not have exactly two columns.
df.columns=headerName
# Split 'Info' on the "<br>" separator and pick fields by position.
df['Address'] = df['Info'].str.split("<br>", n = 1, expand=True)[0]
df['City'] = df['Info'].str.split("<br>", n = 2, expand=True)[1]
df['Postal Code'] = df['Info'].str.split("<br>", n = 3, expand=True)[2]
df['Phone'] = df['Info'].str.split("<br>", n = 5, expand=True)[4]
df['Fax'] = df['Info'].str.split("<br>", n = 7, expand=True)[6]
df['Grades'] = df['Info'].str.split("<br>", n = 9, expand=True)[8]
df['Program'] = df['Info'].str.split("<br>", n = 11, expand=True)[10]
df = df.drop('Info', axis=1)
# Write the result as a tab-separated file without the index column.
df.to_csv("output.tsv", sep = "\t",index=False)
Solution
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(url, timeout=30):
    """Scrape the schools of every district into one DataFrame.

    The district ids are read from the ``DivisionSelection`` drop-down on the
    search page. Each district is then requested, and when the page reports a
    school count the listing is requested again with ``High`` set to that
    count and ``Low`` set to 1.

    Args:
        url: Base URL of the school search endpoint, without a query string.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with one row per school and default integer column
        labels; the first column is the district name.

    Raises:
        requests.HTTPError: If any response has an error status code.
    """
    query = {'action': 'district'}
    with requests.Session() as req:
        r = req.get(url, params=query, timeout=timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')
        # (value, label) for every option except the first one.
        opts = [(x['value'], x.text) for x in soup.select(
            "select[name='DivisionSelection'] > option:not(:first-child)")]
        allin = []
        for division_id, division_name in opts:
            form = {
                'DivisionSelection': division_id,
                'SchoolsInDistrict': 'submit',
            }
            # Post to the caller-supplied endpoint rather than a hard-coded URL.
            r = req.post(url, params=query, data=form, timeout=timeout)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'lxml')
            counter = soup.select_one('.n_schools strong')
            if counter is not None:
                # The last whitespace-separated word of the counter text is
                # used as the upper bound of the listing.
                total = counter.text.rsplit(maxsplit=1)[-1]
                listing = {
                    'High': total,
                    'Low': '1',
                    'DivisionSelection': division_id,
                    'action': 'district',
                }
                r = req.get(url, params=listing, timeout=timeout)
                r.raise_for_status()
                soup = BeautifulSoup(r.text, 'lxml')
            goal = soup.select('div.sc_address:not(:first-child)')
            if goal:
                print(division_name)
                target = [[division_name, *x.stripped_strings] for x in goal]
                for row in target:
                    # Drop every second item from index 5 onward -- presumably
                    # the field labels ("Phone", "Fax", ...); verify on the page.
                    del row[5::2]
                allin.extend(target)
    df = pd.DataFrame(allin)
    print(df)
    return df


if __name__ == "__main__":
    main('https://web36.gov.mb.ca/school/school')
Output:
0 ... 8
0 Adult Learning Centres ... English
1 Adult Learning Centres ... English
2 Adult Learning Centres ... English
3 Adult Learning Centres ... English
4 Adult Learning Centres ... English
.. ... ... ...
960 Winnipeg School Division ... Early Immersion
961 Winnipeg School Division ... Early Immersion, English, French: Communicatio...
962 Winnipeg School Division ... Early Immersion, English
963 Winnipeg School Division ... Early Immersion
964 Winnipeg School Division ... Early Immersion, English
[965 rows x 9 columns]
To write the result to a file, call `df.to_csv('data.csv', index=False)` on the resulting DataFrame.
Answered By - αԋɱҽԃ αмєяιcαη
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.