Sunday, December 31, 2023

[FIXED] Scrapy- not able to navigate to next page

December 31, 2023 python, scrapy, xpath No comments

Issue

I am running a Scrapy project where I want to extract user posts from a social forum. At the moment I am having trouble getting my spider to navigate to the next page in the forum. I think the problem is that I am not getting the correct xpath expression. Here is my spider program:

import scrapy
from datetime import datetime


class PeripartumSpider(scrapy.Spider):
    """Spider for a BabyCenter community group and the threads it lists.

    ``parse`` handles a group listing page: it schedules a request for every
    thread link it finds and then looks for a next-page link.
    ``parse_thread`` handles one thread page and yields one item for the
    opening post followed by one item per reply.
    """

    name = 'babyctr'

    start_urls = ['https://community.babycenter.com/groups/a15325/postpartum_depression_anxiety_and_related_topics']

    def parse(self, response):
        """Schedule every thread on the listing page, then the next page.

        Yields a request per thread link (handled by ``parse_thread``) and,
        when an ``href`` is found on the element with class ``__next``, one
        more request for that URL handled by this method again.
        """
        # NOTE: the XPaths are site-specific and may need editing for
        # another website.
        thread_hrefs = response.xpath('//*[@class="titleText"]/a/@href').extract()
        for href in thread_hrefs:
            yield scrapy.Request(response.urljoin(href), callback=self.parse_thread)

        # Follow the next-page link only when the listing page exposes one.
        next_href = response.xpath('//*[@class= "__next"]/@href').extract_first()
        if next_href is None:
            return
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_thread(self, response):
        """Yield the opening post of a thread, then one item per reply.

        Every item is a dict with the keys ``title``, ``date``,
        ``author_name``, ``post`` and ``comments``.
        """
        # Page-wide lists: the first entry is used for the opening post and
        # the remaining entries for the replies.
        authors = response.xpath("//*[@class='user_nickname textLinks']/a/text()").extract()
        dates = response.xpath("//*[@class='finePrint']/text()").extract()
        title = response.xpath("//*[@id='post_title']/text()").extract()

        yield {
            "title": title,
            # First match, or None when nothing matched.
            "date": dates[0] if dates else None,
            "author_name": authors[0] if authors else None,
            "post": response.xpath("//*[@class='content']/p/text()").extract(),
            "comments": response.xpath("//*[@id='post_comments']/h2/text()").extract(),
        }

        # Replies are paired up positionally; zip stops at the shortest list.
        reply_texts = response.xpath(".//*[@class='post_content bodyText clear']/p//text()").extract()
        for reply_author, reply_text, reply_date in zip(authors[1:], reply_texts, dates[1:]):
            yield {
                "title": title,
                "date": reply_date,
                "author_name": reply_author,
                "post": reply_text,
                "comments": 'Comments (0)',
            }

Can someone please help me see why I am not getting to the next page? I have tried many alternatives such as: response.xpath('//*[@label= "pagination"]/@href') but I am getting either a wrong node or wrong attribute name. The website I am trying to crawl is included in the spider program.

Thank you in advance!

Solution

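The problem is JavaScript, not your XPath: the site appears to build its pagination control client-side, so there is no "next" link in the HTML that Scrapy downloads. Instead of looking for that link, request the listing pages directly through the `?page=N` query parameter and stop when a page returns no posts: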

import scrapy
from datetime import datetime


class PeripartumSpider(scrapy.Spider):
    """Spider for a BabyCenter community group that pages via ``?page=N``.

    ``parse`` handles a group listing page: it schedules every thread it
    finds and then requests the next numbered listing page.
    ``parse_thread`` handles one thread page and yields one item for the
    opening post followed by one item per reply.
    """

    name = 'babyctr'

    start_urls = ['https://community.babycenter.com/groups/a15325/postpartum_depression_anxiety_and_related_topics']
    # Listing-page counter; parse() increments it before building the next
    # "?page=N" URL.
    page = 1

    def parse(self, response):
        """Schedule every thread on the listing page, then the next page.

        Yields a request per thread link (handled by ``parse_thread``). When
        at least one thread link was found, also yields a request for the
        following ``?page=N`` URL.
        """
        # NOTE: the XPaths are site-specific and may need editing for
        # another website.
        post_links = response.xpath('//*[@class="titleText"]/a/@href').extract()
        if not post_links:
            # A listing page without posts ends the crawl of the group.
            return

        for post_link in post_links:
            yield scrapy.Request(response.urljoin(post_link), callback=self.parse_thread)

        self.page += 1
        # No explicit callback is passed for the next listing page.
        yield scrapy.Request(response.urljoin(f'?page={self.page}'))

    def parse_thread(self, response):
        """Yield the opening post of a thread, then one item per reply.

        Every item is a dict with the keys ``title``, ``date``,
        ``author_name``, ``post`` and ``comments``.
        """
        author_xpath = "//*[@class='user_nickname textLinks']/a/text()"
        date_xpath = "//*[@class='finePrint']/text()"

        title = response.xpath("//*[@id='post_title']/text()").extract()

        # Opening post: first author and first date on the page.
        opening_post = {
            "title": title,
            "date": response.xpath(date_xpath).extract_first(),
            "author_name": response.xpath(author_xpath).extract_first(),
            "post": response.xpath("//*[@class='content']/p/text()").extract(),
            "comments": response.xpath("//*[@id='post_comments']/h2/text()").extract(),
        }
        yield opening_post

        # Replies: skip the first author/date (used above) and pair the rest
        # positionally with the reply text nodes; zip stops at the shortest.
        reply_authors = response.xpath(author_xpath).extract()[1:]
        reply_texts = response.xpath(".//*[@class='post_content bodyText clear']/p//text()").extract()
        reply_dates = response.xpath(date_xpath).extract()[1:]

        yield from (
            {
                "title": title,
                "date": posted_on,
                "author_name": author,
                "post": text,
                "comments": 'Comments (0)',
            }
            for author, text, posted_on in zip(reply_authors, reply_texts, reply_dates)
        )

This fixes the pagination, but note that your XPaths inside parse_thread are wrong.

Answered By - SuperUser

This answer was collected from Stack Overflow and tested by PythonFixing community admins; it is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Sunday, December 31, 2023

[FIXED] Scrapy- not able to navigate to next page

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels