Issue
I am running a Scrapy project where I want to extract user posts from a social forum. At the moment I am having trouble getting my spider to navigate to the next page in the forum. I think the problem is that I am not getting the correct xpath expression. Here is my spider program:
import scrapy
from datetime import datetime
class PeripartumSpider(scrapy.Spider):
    """Spider that walks a forum group listing and scrapes every thread in it.

    ``parse`` handles the listing pages; ``parse_thread`` handles a single
    thread and emits one item for the opening post plus one per reply.
    Note: the XPath expressions are site-specific and may need editing for
    other websites.
    """

    name = 'babyctr'
    start_urls = ['https://community.babycenter.com/groups/a15325/postpartum_depression_anxiety_and_related_topics']

    def parse(self, response):
        """Schedule a request for each thread on a listing page, then for the next page."""
        thread_hrefs = response.xpath('//*[@class="titleText"]/a/@href').extract()
        yield from (
            scrapy.Request(response.urljoin(href), callback=self.parse_thread)
            for href in thread_hrefs
        )

        # Follow the "next" link only when the listing page exposes one.
        next_href = response.xpath('//*[@class= "__next"]/@href').extract_first()
        if next_href is None:
            return
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_thread(self, response):
        """Yield the opening post of a thread, followed by each of its replies."""
        title = response.xpath("//*[@id='post_title']/text()").extract()
        authors = response.xpath("//*[@class='user_nickname textLinks']/a/text()").extract()
        dates = response.xpath("//*[@class='finePrint']/text()").extract()

        # The first author/date on the page belong to the opening post
        # (None when the page has no match at all).
        yield {
            "title": title,
            "date": dates[0] if dates else None,
            "author_name": authors[0] if authors else None,
            "post": response.xpath("//*[@class='content']/p/text()").extract(),
            "comments": response.xpath("//*[@id='post_comments']/h2/text()").extract(),
        }

        # Everything after the first author/date is treated as reply data and
        # paired positionally with the reply text nodes.
        reply_texts = response.xpath(".//*[@class='post_content bodyText clear']/p//text()").extract()
        for reply_author, reply_text, reply_date in zip(authors[1:], reply_texts, dates[1:]):
            yield {
                "title": title,
                "date": reply_date,
                "author_name": reply_author,
                "post": reply_text,
                "comments": 'Comments (0)',
            }
Can someone please help me see why I am not getting to the next page? I have tried many alternatives such as: response.xpath('//*[@label= "pagination"]/@href') but I am getting either a wrong node or wrong attribute name. The website I am trying to crawl is included in the spider program.
Thank you in advance!
Solution
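The problem is JavaScript, not your XPath: the "next page" link appears to be added to the page by JavaScript, so it is not present in the HTML that Scrapy downloads and no XPath expression can select it. Instead, request the listing pages directly through the ?page=N query parameter: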
import scrapy
from datetime import datetime
class PeripartumSpider(scrapy.Spider):
    """Spider that walks a paginated forum group listing and scrapes its threads.

    Listing pages are requested through a ``?page=N`` query parameter rather
    than by following a link in the page. Crawling of a listing stops at the
    first page that contains no thread links.
    Note: the XPath expressions are site-specific and may need editing for
    other websites.
    """

    name = 'babyctr'
    start_urls = ['https://community.babycenter.com/groups/a15325/postpartum_depression_anxiety_and_related_topics']
    # Page number that the URLs in ``start_urls`` correspond to. It is only
    # read, never mutated: the current page travels with each request.
    page = 1

    def parse(self, response):
        """Schedule every thread on a listing page, then the following listing page.

        Yields one ``scrapy.Request`` per thread link (handled by
        ``parse_thread``) and, when the page contained at least one thread
        link, one request for the next listing page (handled by ``parse``).
        """
        post_links = response.xpath('//*[@class="titleText"]/a/@href').extract()
        for post_link in post_links:
            yield scrapy.Request(response.urljoin(post_link), callback=self.parse_thread)

        # A listing page without any post links means we are past the last page.
        if not post_links:
            return

        # Keep the page number on the request itself instead of on the spider,
        # so several listings (e.g. more than one start URL) can be paginated
        # without advancing each other's counter.
        next_page = response.meta.get('page', self.page) + 1
        yield scrapy.Request(
            response.urljoin(f'?page={next_page}'),
            callback=self.parse,
            meta={'page': next_page},
        )

    def parse_thread(self, response):
        """Yield the opening post of a thread, followed by each of its replies.

        Every item is a dict with the keys ``title``, ``date``,
        ``author_name``, ``post`` and ``comments``.
        """
        original_post = response.xpath("//*[@class='content']/p/text()").extract()
        title = response.xpath("//*[@id='post_title']/text()").extract()
        author_list = response.xpath("//*[@class='user_nickname textLinks']/a/text()").extract()
        post_dates = response.xpath("//*[@class='finePrint']/text()").extract()
        number_comments = response.xpath("//*[@id='post_comments']/h2/text()").extract()

        # The first author/date on the page are taken as the opening post's
        # (None when nothing matched).
        yield {
            "title": title,
            "date": post_dates[0] if post_dates else None,
            "author_name": author_list[0] if author_list else None,
            "post": original_post,
            "comments": number_comments,
        }

        # Remaining authors/dates are treated as belonging to the replies.
        # NOTE(review): the reply XPath returns one string per text node, and
        # zip() pairs purely by position and stops at the shortest list, so a
        # reply made of several text nodes would shift the pairing -- confirm
        # against the page markup.
        comment_response_list = response.xpath(".//*[@class='post_content bodyText clear']/p//text()").extract()
        replies = zip(author_list[1:], comment_response_list, post_dates[1:])
        for reply_author, reply_text, reply_date in replies:
            yield {
                "title": title,
                "date": reply_date,
                "author_name": reply_author,
                "post": reply_text,
                "comments": 'Comments (0)',
            }
This fixes the pagination, but note that your XPath expressions inside parse_thread
are wrong.
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.