Issue
I am using the following code to extract all the reviews from the first 20 pages of Trustpilot:
import datetime as dt

import pandas as pd


def parse_review_date(raw):
    """Turn a Trustpilot review date string into a ``datetime.date``.

    Handles the relative forms the site emits ("3 hours ago", "A day ago",
    "12 days ago", optionally prefixed with "Updated ") as well as absolute
    dates such as "Mar 5, 2023".
    """
    text = raw.replace("Updated ", "").strip()
    lowered = text.lower()
    today = dt.datetime.now().date()
    if "hour ago" in lowered or "hours ago" in lowered:
        return today
    if "a day ago" in lowered:
        return today - dt.timedelta(days=1)
    if "days ago" in lowered:
        # BUG FIX: use the whole leading number. The original int(text[0])
        # read only the first digit, so "12 days ago" was treated as 1 day.
        return today - dt.timedelta(days=int(lowered.split()[0]))
    return dt.datetime.strptime(text, "%b %d, %Y").date()


def scrape_reviews(from_page=1, to_page=20):
    """Scrape Trustpilot review pages ``from_page``..``to_page`` (inclusive)
    and return the data as a pandas DataFrame.

    Every per-review lookup is guarded: when an element is missing, an empty
    placeholder is appended instead of raising AttributeError, so all columns
    stay aligned row-for-row.
    """
    # Imported here so parse_review_date stays importable (and testable)
    # without the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    review_titles = []
    review_dates_original = []
    review_dates = []
    review_ratings = []
    review_texts = []
    page_number = []

    for i in range(from_page, to_page + 1):
        response = requests.get(f"https://uk.trustpilot.com/review/www.abc.com?page={i}")
        soup = BeautifulSoup(response.text, "html.parser")
        for review in soup.find_all(class_="paper_paper__1PY90 paper_square__lJX8a card_card__lQWDv card_noPadding__D8PcU card_square___tXn9 styles_navigationContainer__kPGA_"):
            # Review title — may be absent if the markup changed; append ""
            # rather than crashing (this was the reported AttributeError).
            title_tag = review.find(class_="typography_heading-s__f7029 typography_appearance-default__AAY17")
            review_titles.append(title_tag.get_text() if title_tag else "")

            # Review date, both the raw text and the parsed date object.
            date_tag = review.select_one(selector="time")
            raw_date = date_tag.get_text() if date_tag else ""
            review_dates_original.append(raw_date)
            review_dates.append(parse_review_date(raw_date) if raw_date else None)

            # Review rating: the star image's alt text carries the rating.
            rating_tag = review.find(class_="star-rating_starRating__4rrcf star-rating_medium__iN6Ty")
            star_img = rating_tag.findChild() if rating_tag else None
            review_ratings.append(star_img["alt"] if star_img else "")

            # When there is no review text, append "" instead of skipping so
            # that data remains in sequence with the other review columns.
            text_tag = review.find(class_="typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn")
            review_texts.append(text_tag.get_text() if text_tag else "")

            # Trustpilot page number this review came from.
            page_number.append(i)

    # Create final dataframe from the parallel lists.
    return pd.DataFrame(
        list(zip(review_titles, review_dates_original, review_dates,
                 review_ratings, review_texts, page_number)),
        columns=['review_title', 'review_date_original', 'review_date',
                 'review_rating', 'review_text', 'page_number'])


if __name__ == "__main__":
    # Set Trustpilot page numbers to scrape here.
    df_reviews = scrape_reviews(from_page=1, to_page=20)
and I am getting the following error:
AttributeError Traceback (most recent call last)
in
24 # Review titles
25 review_title = review.find(class_ = "typography_heading-s__f7029
typography_appearance-default__AAY17")
---> 26 review_titles.append(review_title.getText())
27 # Review dates
28 review_date_original = review.select_one(selector="time")
AttributeError: 'NoneType' object has no attribute 'getText'
I understand that one of the review titles is coming out as None, and the same issue persists for the other elements as well, such as the dates, pages, and review text.
How should I resolve this?
Solution
I did this exact exercise 3 months ago; the code that worked for me was the following:
from time import sleep
import requests
import pandas as pd
from bs4 import BeautifulSoup
def soup2list(src, list_, attr=None):
    """Collect data from a sequence of bs4 tags into ``list_`` in place.

    When ``attr`` is given, the value of that attribute is taken from each
    tag; otherwise the tag's text content is used.
    """
    if attr:
        list_.extend(tag[attr] for tag in src)
    else:
        list_.extend(tag.get_text() for tag in src)
# Parallel per-review column accumulators; soup2list appends one entry per
# matched tag, so these stay aligned only if every card has every element.
users = []
userReviewNum = []
ratings = []
locations = []
dates = []
reviews = []

# Pages to scrape (inclusive) and the company slug in the Trustpilot URL.
from_page = 1
to_page = 60
company = 'eshakti.com'

for i in range(from_page, to_page + 1):
    result = requests.get(fr"https://www.trustpilot.com/review/{company}?page={i}")
    # BUG FIX: name the parser explicitly. Omitting it raises bs4's
    # GuessedAtParserWarning and can produce different trees on machines
    # with different parsers installed.
    soup = BeautifulSoup(result.content, "html.parser")

    # Trust Pilot was setup in a way that's not friendly to scraping, so this hacky method will do.
    soup2list(soup.find_all('span', {'class','typography_heading-xxs__QKBS8 typography_appearance-default__AAY17'}), users)
    soup2list(soup.find_all('div', {'class','typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_detailsIcon__Fo_ua'}), locations)
    soup2list(soup.find_all('span', {'class','typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l'}), userReviewNum)
    soup2list(soup.find_all('div', {'class','styles_reviewHeader__iU9Px'}), dates)
    soup2list(soup.find_all('div', {'class','styles_reviewHeader__iU9Px'}), ratings, attr='data-service-review-rating')
    soup2list(soup.find_all('div', {'class','styles_reviewContent__0Q2Tg'}), reviews)

    # To avoid throttling
    sleep(1)

# Assemble the collected columns into one DataFrame.
review_data = pd.DataFrame(
    {
        'Username':users,
        'Total reviews':userReviewNum,
        'location':locations,
        'date':dates,
        'content':reviews,
        'Rating': ratings
    })
The likely reason your version fails is that Trustpilot's HTML class names have changed over time. Note that you'll need to change the range of `i` in the for loop to match the pages you want, and change the company name.
Answered By - Yazeed Alnumay
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.