Issue
I am trying to pull the names of current U.S. members of Congress from https://www.congress.gov/members?q=%7B%22congress%22:[%22117%22%2C%22118%22]%7D&pageSize=250&page=1&searchResultViewType=expanded&KWICView=true. I'm leveraging the code I found here, and mine is exactly the same, but I keep getting a NoneType error when I call get_num_pages.
I thought that was because page_num_element wasn't returning anything, but when I look at the source code of the website I can't see any reason why it wouldn't work.
Here's the code:
import json
import re
import requests
import pandas as pd
import numpy as np
import urllib.parse
from bs4 import BeautifulSoup

congress_numbers = [117, 118]
page_size = 250

def get_congress_url(congress_numbers, page_size, page):
    congress_q_str = "{" + '"congress":[{}]'.format(','.join('"{0}"'.format(num) for num in congress_numbers)) + "}"
    params = {
        "q": congress_q_str,
        "pageSize": page_size,
        "page": page,
        "searchResultViewType": "expanded",
        "KWICView": "true"
    }
    actual_params = urllib.parse.urlencode(params, safe='{}:[]')
    return "https://www.congress.gov/members?{}".format(actual_params)

def get_num_pages(soup):
    page_num_element = soup.find("div", {"class": "basic-search-tune-number"}).find("div", {"class": "pagination"}).find("span", {"class": "results-number"}).text
    page_num_raw = [int(s) for s in page_num_element.split() if s.isdigit()]
    return int(page_num_raw[0])

def extract_member_name_url(entry):
    result_heading = entry.find("span", {"class": "result-heading"})
    member_name = result_heading.text
    if "Representative" in member_name:
        member_name = member_name.replace("Representative", "").strip()
    elif "Senator" in member_name:
        member_name = member_name.replace("Senator", "").strip()
    url = result_heading.a['href']
    return [member_name, url]

def extract_congress_members(congress_list):
    members = []
    for entry in congress_list:
        member = {}
        member["name"], member["url"] = extract_member_name_url(entry)
        members.append(member)
    return members

# Now that we know how many pages we'll scrape, we can start scraping
congress_url = get_congress_url(congress_numbers, page_size, 1)
print("Congress URL: ", congress_url)
response = requests.get(congress_url).text
soup = BeautifulSoup(response, "html.parser")

# We need the number of pages we will need to scrape
num_pages = get_num_pages(soup)
print("Number of result pages: ", num_pages)
Solution
Better check your response / soup first to get an idea of what you actually receive: your code no longer works because the site checks for JavaScript, which is not supported by requests.
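As a minimal check, reusing a selector from the question (the exact URL and selector here are just for illustration), you can print what requests actually receives:

import requests
from bs4 import BeautifulSoup

# Fetch the raw HTML without a JavaScript engine; because the results are
# rendered client-side, the pagination element that get_num_pages() relies
# on never appears in this response.
html = requests.get('https://www.congress.gov/members?pageSize=250').text
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('div', {'class': 'basic-search-tune-number'}))  # -> None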
So you may use selenium to mimic browser behaviour: wait for the content to render, click the next page if one is available, wait for the actionLoaderWrapper to be gone, and repeat.
Just a simple example, to outline the basic steps:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

driver = webdriver.Firefox()
driver.maximize_window()

url = 'https://www.congress.gov/members?q=%7B%22congress%22%3A%5B%22117%22%2C%22118%22%5D%7D&pageSize=250'
driver.get(url)

data = []
while True:
    # wait until the loading overlay is gone and the result list is present
    WebDriverWait(driver, 30).until(EC.invisibility_of_element_located((By.CSS_SELECTOR, '.actionLoaderWrapper')))
    items = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#search-results-wrapper ol.basic-search-results-lists li.expanded')))
    for e in items:
        ## perform actions on the elements that are needed ...
        d = {
            'name': e.find_element(By.CSS_SELECTOR, '.result-heading a').text.split(' - ')[0]
        }
        # turn each "Label: value" line of the result item into a dict entry
        d.update(dict(x.text.strip().split(':', 1) for x in e.find_elements(By.CSS_SELECTOR, '.result-item')))
        data.append(d)
    ## just for demonstration purposes: to scrape all result pages, release this "handbrake" by removing the break
    break
    try:
        print(driver.find_element(By.CSS_SELECTOR, 'a.next[href]').get_attribute('href'))
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.next[href]'))).click()
    except NoSuchElementException:
        break

driver.close()
pd.DataFrame(data)
Output
|   | name | State | District | Party | Served |
|---|---|---|---|---|---|
| 0 | Adams, Alma S. | North Carolina | 12 | Democratic | House: 2014-Present |
| 1 | Herrera Beutler, Jaime | Washington | 3 | Republican | House: 2011-2023 |
| 2 | Rutherford, John H. | Florida | 5 | Republican | House: 2017-Present |
...
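If you want to keep the results, a minimal follow-up, assuming data was filled by the loop above (congress_members.csv is just an example file name):

df = pd.DataFrame(data)
df.to_csv('congress_members.csv', index=False)  # example output file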
Answered By - HedgeHog