Issue
I would like to scrape data from https://www.arduinothai.com/category/2/arduino-compatible-board using Python 3.5 and BeautifulSoup. I can successfully scrape the data on the first page, but I am not able to scrape data from the other pages. This is my code
"""Scrape product IDs and stock levels from every page of the
arduinothai 'arduino-compatible-board' category.

NOTE(review): the original paste had lost all indentation, contained an
unterminated ''' literal, and referenced several names (ConvertListToStr,
ProductIDAll, Productname, Productprice, ...) that were never defined.
This reconstruction keeps the visible scraping logic and marks the
missing pieces with TODOs.
"""
import json
import math
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.arduinothai.com/category/2/arduino-compatible-board'


def _convert_list_to_str(fragments):
    """Join the regex match fragments into a single product-ID string."""
    return ''.join(fragments)


# --- Discover how many category pages exist ---------------------------------
first_page = requests.get(BASE_URL)
soup = BeautifulSoup(first_page.text, 'lxml')

# The second 'tsk-all' span on the page holds the total product count.
count_spans = soup.find_all('span', 'tsk-all')
total_products = int(count_spans[1].text)
PRODUCTS_PER_PAGE = 40
# Use math.ceil, not round(): round() drops the last page whenever the
# final page holds fewer than half a page of products.
total_pages = math.ceil(total_products / PRODUCTS_PER_PAGE)

# Accumulators (the original referenced these without defining them).
product_ids = []
stock_levels = []
rows = []

# Matches IDs like 'AB12345', bare '12345', or any 4 chars + 5 digits.
ID_PATTERN = re.compile(r'[A-Z]{2}\d{5}|\d{5}|....\d{5}')

# --- Walk every category page via the ?tskp= query parameter ----------------
for page in range(1, total_pages + 1):
    page_url = BASE_URL + '?tskp=' + str(page)
    page_resp = requests.get(page_url)
    page_soup = BeautifulSoup(page_resp.text, 'lxml')

    for product in page_soup.find_all('div', class_='productDetail'):
        # The anchor's 'gaeepd' attribute carries JSON product metadata;
        # its 'id' field is the product's detail-page identifier.
        detail_json = product.find('a').get('gaeepd')
        product_link_id = json.loads(detail_json)['id']

        # Scrape the product ID shown in the 'code' span.
        raw_id = product.find('span', 'code').get_text(strip=True)
        id_str = _convert_list_to_str(ID_PATTERN.findall(raw_id))
        product_ids.append(id_str)

        # Stock is only shown on the product's own detail page.
        detail_resp = requests.get(
            'https://www.arduinothai.com/product/' + str(product_link_id))
        detail_soup = BeautifulSoup(detail_resp.text, 'lxml')
        stock = detail_soup.find('span', class_='num').text
        stock_levels.append(stock)

        # TODO(review): the original also collected name, price, old price,
        # link and category (NameOfProduct, PriceOfProduct, ...), filtered on
        # ProductCategory_jsonData in {'Single Set', 'Triple Set', 'STM32'} --
        # that scraping code was never shown, so only the grounded columns
        # are kept here.
        rows.append((id_str, stock))

# --- Assemble and display the result ----------------------------------------
df = pd.DataFrame(rows, columns=['ProductID', 'Stock'])
pd.set_option('display.max_rows', df.shape[0] + 1)
print(df)
Solution
Just run the scraping code once for each of the two page URLs:
import re
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from requests import get

# The category is paginated via the ?tskp= query parameter, so simply
# request each page index in turn and parse it.
for page_number in [1, 2]:
    URL = ('https://www.arduinothai.com/category/2/arduino-compatible-board'
           '?tskp=' + str(page_number))
    Request = requests.get(URL)
    soups = BeautifulSoup(Request.text, 'lxml')
    # your scrape here
Answered By - Joshua
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.