Issue
I have been struggling to get the list of links from multiple pages of forum search results ('serp'-paged content). My code works well (my goal is to dump all the conversations for a search result into a PDF), but it does not get past the first page of threads. When I do a quick page-source compare of the two URLs, I can see the problem: the second URL appends '#serp=2' and loads correctly in a browser, but the page source my script downloads is identical to the first page, with the first page's links.
Here is my code below. Any suggestions for how to pull results from subsequent pages, or is there a way to pull all results at once?
#! python3
# getE2EResults.py - Opens all E2E threads and saves them to a file.
import requests, sys, webbrowser, bs4, pdfkit
from pypac import PACSession

session = PACSession()
path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
site_list = []

print('Searching...')  # display text while downloading
res = session.get('http://e2e.ti.com/search?q=' + ''.join(sys.argv[1:]) +
                  '&category=forum&date=&customdaterange=0&startdate=&enddate=')
res.raise_for_status()

# Retrieve top search result links.
soup = bs4.BeautifulSoup(res.text, 'lxml')

# Find the number of pages in the search results (10 results per page).
mydivs = soup.findAll("div", {"class": "search-view-by-sort"})
string1 = mydivs[0].text
numberOfResults = [int(s) for s in string1.split() if s.isdigit()]
numberOfPages = numberOfResults[0] // 10
if numberOfResults[0] % 10 > 0:
    numberOfPages += 1
print(str(numberOfPages) + ' pages of results')

###########################################
# Find all 10 post links on the first page, add them to the site list.
linkElems = soup.select('.name a')
numOpen = min(10, len(linkElems))
for i in range(numOpen):
    res1 = session.get(linkElems[i].get('href'))
    res1.raise_for_status()
    site_list.append(linkElems[i].get('href'))
    # soup1 = bs4.BeautifulSoup(res1.text)
    # webbrowser.open(linkElems[i].get('href'))

# Repeat for all remaining pages in the search results.
if numberOfPages > 1:
    for n in range(2, numberOfPages + 1):
        res = session.get('http://e2e.ti.com/search?q=' + ''.join(sys.argv[1:]) +
                          '&category=forum&date=&customdaterange=0&startdate=&enddate=#serp=' + str(n))
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, 'lxml')
        linkElems = soup.select('.name a')
        numOpen = min(10, len(linkElems))
        for i in range(numOpen):
            res1 = session.get(linkElems[i].get('href'))
            res1.raise_for_status()
            site_list.append(linkElems[i].get('href'))

counter = 1
for item in site_list:
    print(str(counter) + ' ' + item)
    counter += 1

'''
# Create a pdf of all results
#print(site_list)
counter = 1
for item in site_list:
    pdfkit.from_url(item, 'out'+str(counter)+'.pdf', configuration=config)
    counter += 1
#pdfkit.from_url(site_list, ''.join(sys.argv[1:])+'.pdf', configuration=config)
'''
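From what I can tell, everything after the '#' is a URL fragment, which the browser resolves client-side (the page's JavaScript reacts to it) and which is never included in the HTTP request, so the server hands back the same first page every time. A quick standard-library check of that (a sketch, not part of the script above):

from urllib.parse import urldefrag

# Only the part before '#' is ever sent to the server;
# the fragment stays on the client.
url, fragment = urldefrag('http://e2e.ti.com/search?q=test&category=forum#serp=2')
print(url)       # http://e2e.ti.com/search?q=test&category=forum
print(fragment)  # serp=2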
Solution
The easiest approach is to extract the next-page URL from each results page and use it for the following request. When that link is missing, you know you have reached the last page:
from bs4 import BeautifulSoup
import requests


def get_page_urls(html):
    soup = BeautifulSoup(html, 'lxml')
    # Find the number of pages in the search results
    number_of_pages = int(soup.find(class_='search-view-by-sort').span.text.split(' ')[2].replace(',', '')) // 10
    # Find the URL for the next page
    next_url = soup.find('a', class_='next')
    if next_url:
        next_url = base_url + next_url['href']
    # Display/store all of the links
    for link in soup.select('.name a'):
        site_list.append(link['href'])
        print(' ', link['href'])
    return number_of_pages, next_url


site_list = []
page_number = 1
jar = requests.cookies.RequestsCookieJar()
base_url = 'http://e2e.ti.com'
search = 'Beaglebone black'
url = '{}/search?q={}&category=forum&date=&customdaterange=0&startdate=&enddate='.format(base_url, search)

print("Page 1")
res = requests.get(url, cookies=jar)
number_of_pages, url = get_page_urls(res.text)

while url:
    page_number += 1
    print("Page {} of {}".format(page_number, number_of_pages))
    res = requests.get(url, cookies=jar)
    _, url = get_page_urls(res.text)
This code keeps requesting pages and storing the links until all pages have been received. Note that the search term is hard-coded for the sake of testing, and plain requests is used; if you need the PAC proxy from your original script, the PACSession's session.get is a drop-in replacement for requests.get.
This would give you results starting like:
Page 1
http://e2e.ti.com/support/arm/sitara_arm/f/791/t/270719?tisearch=e2e-sitesearch&keymatch=Beaglebone black
http://e2e.ti.com/support/embedded/linux/f/354/t/483988?tisearch=e2e-sitesearch&keymatch=Beaglebone black
..
..
Page 2 of 308
http://e2e.ti.com/support/embedded/starterware/f/790/t/301790?tisearch=e2e-sitesearch&keymatch=Beaglebone black
http://e2e.ti.com/support/arm/sitara_arm/f/791/t/501015?tisearch=e2e-sitesearch&keymatch=Beaglebone black
..
..
Page 3 of 308
http://e2e.ti.com/support/embedded/starterware/f/790/p/285959/1050634?tisearch=e2e-sitesearch&keymatch=Beaglebone black#1050634
..
..
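From there, finishing your original goal is just a matter of feeding site_list to pdfkit, along the lines of the commented-out block in the question (a sketch reusing the question's wkhtmltopdf path; adjust it to your install):

import pdfkit

# wkhtmltopdf path taken from the question; change to match your install.
path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

# Render each collected thread URL to its own numbered PDF.
for counter, item in enumerate(site_list, start=1):
    pdfkit.from_url(item, 'out{}.pdf'.format(counter), configuration=config)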
Answered By - Martin Evans