Issue
I would like to scrape the product description of every product from the search results. The search results are in 50 pages with 60 products per page. So, in total I need to scrape 3000 products. With the code I currently have, this error comes up on random runs:
---------------------------------------------------------------------------
TimeoutException Traceback (most recent call last)
/var/folders/hj/yrd6ng651fv5d2_gtngcysy40000gn/T/ipykernel_50778/696262163.py in <module>
34 for link in product_links:
35 driver.get(link)
---> 36 WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.CLASS_NAME, "page-product")))
37
38 driver.execute_script("""
/opt/miniconda3/lib/python3.9/site-packages/selenium/webdriver/support/wait.py in until(self, method, message)
78 if time.time() > end_time:
79 break
---> 80 raise TimeoutException(message, screen, stacktrace)
81
82 def until_not(self, method, message=''):
TimeoutException: Message:
Sometime the error happens after it scrapes 60 data, sometimes after 300 data, sometimes not even scrape a single data.
I tried to modified the WebDriverWait from the 10 up to 100. Still, it doesn't fix the issue.
Anyone know how to fix this?
Here is my code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from collections import Counter
import json
from turtle import delay
import time
import pandas as pd
# create object for chrome options
chrome_options = Options()
# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1365,6572.610")
chrome_options.add_argument('--disable-infobars')
# create webdriver object
path = '/Applications/chromedriver'
webdriver_service = Service(path)
driver = webdriver.Chrome(executable_path=path, options=chrome_options)
baseurl = 'https://shopee.co.id'
product_links = []
for page in range(5, 11):
search_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page={}'.format(page)
driver.get(search_link)
WebDriverWait(driver, 80).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result")))
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
sleep(5)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
for item in product_list:
for link in item.find_all('a', href=True):
product_links.append(baseurl + link['href'])
#testlink = 'https://shopee.co.id/Obat-Herbal-Kanker-Payudara-Serviks-Hati-Usus-Prostat-Leukimia-dan-Paru-Paru-ORIGINAL-100-ASLI-i.166801435.2584201334?sp_atk=70c736d4-ed07-435c-8edd-4e6e7552a91d&xptdk=70c736d4-ed07-435c-8edd-4e6e7552a91d'
herbcancerlist = []
for link in product_links:
driver.get(link)
WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.CLASS_NAME, "page-product")))
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
sleep(10)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
try:
name = soup.find('div', class_='_2rQP1z').get_text()
price = soup.find('div', class_='_2Shl1j').get_text()
sold = soup.find('div', class_ = 'HmRxgn').get_text()
rate = soup.find('div', class_ = '_3y5XOB _14izon').get_text()
city = soup.find('span', class_ = '_2fJrvA').get_text()
specification = soup.find('div', class_ = '_2jz573').get_text()
except:
name = 'No name'
price = 'No price'
sold = 'No value'
rate = 'No rate'
city = 'No city'
specification = 'No spec'
herbcancer = {
'name': name,
'price': price,
'sold': sold,
'rate': rate,
'city': city,
'specification': specification
}
herbcancerlist.append(herbcancer)
print('Saving: ', herbcancer['name'])
df = pd.DataFrame(herbcancerlist)
print(df.head())
Solution
This one way of getting all those products' info, put them into a dataframe, and save that dataframe to disk as a csv file:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import json
import pandas as pd
from tqdm.notebook import tqdm
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
chrome_options.add_argument('--load-extension=/home/user/.config/chromium/Default/Extensions/cjpalhdlnbpafiamejdnhcphjbkeiagm/1.44.0_1')
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
big_df = pd.DataFrame()
for x in tqdm(range(50)):
url = f'https://shopee.co.id/search?keyword=obat%20kanker&page={x}'
browser.get(url)
t.sleep(5)
items = WebDriverWait(browser, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'script[data-rh="true"]')))
for i in items:
json_obj = json.loads(i.get_attribute('innerHTML'))
if json_obj['@type'] == 'Product':
big_df = pd.concat([big_df, pd.json_normalize(json_obj)], axis=0, ignore_index=True)
t.sleep(1)
print(big_df)
big_df.to_csv('medicinal_herbs_indonesian.csv')
Note the use of an adblocking extension, and the use of pandas/padas concat. Result printed in terminal:
@context @type name description url productID image brand offers.@type offers.price offers.priceCurrency offers.availability aggregateRating.@type aggregateRating.bestRating aggregateRating.worstRating aggregateRating.ratingCount aggregateRating.ratingValue offers.lowPrice offers.highPrice
0 http://schema.org Product GRAVIDA BHARATA OBAT KANKER PAYUDARA AMPUH |KANKER GANAS HERBAL TERDAFTAR DBPOM MUI WARYANTO076 https://shopee.co.id/GRAVIDA-BHARATA-OBAT-KANKER-PAYUDARA-AMPUH-KANKER-GANAS-HERBAL-TERDAFTAR-DBPOM-MUI-WARYANTO076-i.282306593.7674724221 7674724221 https://cf.shopee.co.id/file/40d3d0c5a7fc388294950b7586081843 Offer 275000.00 IDR http://schema.org/InStock AggregateRating 5.0 1.0 252 4.89 NaN NaN
1 http://schema.org Product Walatra Zedoril 7 Asli Obat Herbal Kanker Tumor Dan Segala Jenis Benjolan Aman Tanpa Efek Samping https://shopee.co.id/Walatra-Zedoril-7-Asli-Obat-Herbal-Kanker-Tumor-Dan-Segala-Jenis-Benjolan-Aman-Tanpa-Efek-Samping-i.189502097.3139156637 3139156637 https://cf.shopee.co.id/file/b0b7d9c09666f2fd237f3279b8194dc4 Walatra Offer 255000.00 IDR http://schema.org/InStock AggregateRating 5.0 1.0 1272 4.80 NaN NaN
2 http://schema.org Product Obat Herbal Kanker Payudara, Serviks, Hati, Usus, Prostat, Leukimia dan Paru Paru ORIGINAL 100% ASLI https://shopee.co.id/Obat-Herbal-Kanker-Payudara-Serviks-Hati-Usus-Prostat-Leukimia-dan-Paru-Paru-ORIGINAL-100-ASLI-i.166801435.2584201334 2584201334 https://cf.shopee.co.id/file/1a159f810da5508ec3330ce174ca2eab Offer 525000.00 IDR http://schema.org/InStock AggregateRating 5.0 1.0 1197 4.90 NaN NaN
3 http://schema.org Product IDR Madu Hitam / Obat Kanker / Obat Kanker Serviks 350 gram https://shopee.co.id/IDR-Madu-Hitam-Obat-Kanker-Obat-Kanker-Serviks-350-gram-i.12836685.4460609439 4460609439 https://cf.shopee.co.id/file/3b96e9f5fb11977b946bb5e1c4585a72 IDR Offer 250000.00 IDR http://schema.org/InStock AggregateRating 5.0 1.0 6 4.83 NaN NaN
[....]
Following on this logic, you can get the individual products' urls from big_df['url'], and scrape them one by one, also using that adblock extension. To make the code more redundant, you can use a database, save the big dataframe there, save the products there, use try/except blocks, mark the ones successfully scraped, retry the others.
Answered By - Barry the Platipus
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.