Issue
I am new to data science and learning how to do web scraping. I am trying to scrape reviews of a particular product, but I am not able to scrape all of the reviews.
My code now runs with no error; originally it raised the error shown below, and then I changed the code.
Old Code:
# Original (broken) scraper quoted from the question.
# NOTE(review): indentation was lost by the blog formatting; the comments
# below mark where the AttributeError in the traceback originates.
def perform_scraping(link):
response = requests.get(link)
soup = BeautifulSoup(response.content,"html.parser")
all_div = soup.find_all("div", class_="_1AtVbE col-12-12")
del all_div[0]
# NOTE(review): starting at 1 after already deleting index 0 skips one card.
for review in range(1,len(all_div)):
# BUG: not every matched div is a review card, so find() can return None
# and None.text raises AttributeError: 'NoneType' object has no attribute 'text'.
rating = all_div[review].find("div", class_ = "_3LWZlK _1BLPMq").text
review_title = all_div[review].find("p","_2-N8zT").text
all_reviews.append((rating,review_title))
# Driver quoted from the question: builds one page URL and scrapes it.
all_reviews = []
# NOTE(review): range(1, 2) yields only 1, so only page 1 is requested.
for page_number in range(1,2):
i = page_number
link = f"https://www.flipkart.com/prowl-tiger-shroff-push-up-board-upper-body-workout-push-up-bar/product-reviews/itm0487671f4df34?pid=BAAGM82GUHTQ3KFZ&lid=LSTBAAGM82GUHTQ3KFZVCWDQ9&marketplace=FLIPKART&page={i}"
perform_scraping(link)
# A bare expression only displays in a notebook cell; use print() in a script.
all_reviews
Error:
AttributeError Traceback (most recent call last)
Cell In[3], line 15
13 i = page_number
14 link = f"https://www.flipkart.com/prowl-tiger-shroff-push-up-board-upper-body-workout-push-up-bar/product-reviews/itm0487671f4df34?pid=BAAGM82GUHTQ3KFZ&lid=LSTBAAGM82GUHTQ3KFZVCWDQ9&marketplace=FLIPKART&page={i}"
---> 15 perform_scraping(link)
16 all_reviews
Cell In[3], line 7, in perform_scraping(link)
5 del all_div[0]
6 for review in range(1,len(all_div)):
----> 7 rating = all_div[review].find("div", class_ = "_3LWZlK _1BLPMq").text
8 review_title = all_div[review].find("p","_2-N8zT").text
9 all_reviews.append((rating,review_title))
AttributeError: 'NoneType' object has no attribute 'text'
I am sharing the updated code below; please help.
Updated Code:
def perform_scraping(link):
    """Scrape one review page and return a list of (rating, title) tuples."""
    page = requests.get(link)
    soup = BeautifulSoup(page.content, "html.parser")
    cards = soup.find_all("div", class_="_1AtVbE col-12-12")
    del cards[0]  # the first match is a layout wrapper, not a review card
    reviews = []
    for card in cards:
        rating_tag = card.find("div", class_="_3LWZlK _1BLPMq")
        title_tag = card.find("p", class_="_2-N8zT")
        # Non-review cards lack these tags; skip them instead of crashing.
        if rating_tag is not None and title_tag is not None:
            reviews.append((rating_tag.text, title_tag.text))
    return reviews
# Collect the reviews page by page (range(1, 2) visits page 1 only).
all_reviews = []
url_template = (
    "https://www.flipkart.com/prowl-tiger-shroff-push-up-board-upper-body-workout-push-up-bar"
    "/product-reviews/itm0487671f4df34?pid=BAAGM82GUHTQ3KFZ"
    "&lid=LSTBAAGM82GUHTQ3KFZVCWDQ9&marketplace=FLIPKART&page={}"
)
for page_number in range(1, 2):
    all_reviews.extend(perform_scraping(url_template.format(page_number)))
print(all_reviews)
Please help.
Thanks in advance.
I hope to hear from you soon.
Solution
Taking your example, you aren't traversing all the pages of reviews. Your loop uses range(1, 2), whose upper bound is exclusive, so it only visits page 1 — but this product's reviews continue onto page 2.
def perform_scraping(link):
    """Collect (rating, title) pairs from a single Flipkart review page."""
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    review_cards = soup.find_all("div", class_="_1AtVbE col-12-12")
    del review_cards[0]  # first match is the page wrapper, not a review
    results = []
    for index in range(len(review_cards)):
        card = review_cards[index]
        rating_node = card.find("div", class_="_3LWZlK _1BLPMq")
        title_node = card.find("p", class_="_2-N8zT")
        # Skip cards where either lookup came back None (ads, separators, ...).
        if rating_node is not None and title_node is not None:
            results.append((rating_node.text, title_node.text))
    return results
all_reviews = []
# This product has two review pages, hence pages 1 and 2.
for page in (1, 2):
    page_url = (
        "https://www.flipkart.com/prowl-tiger-shroff-push-up-board-upper-body-workout-push-up-bar"
        "/product-reviews/itm0487671f4df34?pid=BAAGM82GUHTQ3KFZ"
        f"&lid=LSTBAAGM82GUHTQ3KFZVCWDQ9&marketplace=FLIPKART&page={page}"
    )
    all_reviews.extend(perform_scraping(page_url))
print(all_reviews)
This is your updated code for the same.
And, I have made the code more dynamic for the pages and cleaned it up a bit.
from bs4 import BeautifulSoup
import re
import requests
def perform_scraping(link):
    """Scrape every (rating, title) pair from one Flipkart review page."""
    markup = requests.get(link).text
    soup = BeautifulSoup(markup, 'lxml')
    # Compile once; the class attribute varies only after the common prefix.
    rating_class = re.compile('_3LWZlK .*_1BLPMq')
    collected = []
    for card in soup.find_all('div', {'class': '_27M-vq'}):
        rating = card.find('div', {'class': rating_class})
        title = card.find('p', {'class': '_2-N8zT'})
        if rating and title:
            collected.append((rating.text, title.text))
    return collected
# Discover how many review pages exist, then scrape them all.
all_reviews = []
product_url = 'https://www.flipkart.com/prowl-tiger-shroff-push-up-board-upper-body-workout-push-up-bar/product-reviews/itm0487671f4df34'
response = requests.get(product_url)
soup = BeautifulSoup(response.text, "html.parser")
# FIX: the original chained .find() directly off soup.find(...), which raises
# AttributeError when the pagination div is missing (single-page products) --
# the very NoneType bug the question is about, and it made the else branch
# unreachable in that case. Look each node up once and guard against None.
# Also use a raw string for the regex so '\d' is not an invalid escape.
pager = soup.find('div', {'class': '_2MImiq _1Qnn1K'})
page_label = pager.find('span', string=re.compile(r'Page.* of (\d+)')) if pager else None
if page_label:
    # The label reads "Page 1 of N"; the last token is the page count.
    pages = int(page_label.text.strip().split(' ')[-1])
    for page in range(1, pages + 1):
        review_page = f"{product_url}?page={page}"
        all_reviews.extend(perform_scraping(review_page))
else:
    # No pagination widget: every review lives on the product URL itself.
    all_reviews.extend(perform_scraping(product_url))
print(all_reviews)
Both snippets behave the same for the given example, but the latter is dynamic and will work for any number of review pages.
Output (First code):
[('4', 'Worth the money'), ('5', 'Classy product'), ('5', 'Wonderful'), ('5', 'Great product'), ('4', 'Really Nice'), ('5', 'Just wow!'), ('5', 'Highly recommended.'), ('5', 'Brilliant'), ('5', 'Terrific')]
Output (2nd Code):
[('4', 'Worth the money'), ('5', 'Classy product'), ('2', 'Moderate'), ('1', 'Very poor'), ('5', 'Wonderful'), ('5', 'Great product'), ('1', 'Waste of money!'), ('4', 'Really Nice'), ('5', 'Just wow!'), ('1', 'Useless product'), ('5', 'Highly recommended.'), ('5', 'Brilliant'), ('5', 'Terrific'), ('1', 'Utterly Disappointed')]
Answered By - Zero
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.