Issue
I am working on certain stock-related projects where I have had a task to scrape all data on a daily basis for the last 5 years. i.e from 2016 to date. I particularly thought of using selenium because I can use crawler and bot to scrape the data based on the date. So I used the use of button click with selenium and now I want the same data that is displayed by the selenium browser to be fed by scrappy. This is the website I am working on right now. I have written the following code inside scrappy spider.
class FloorSheetSpider(scrapy.Spider):
name = "nepse"
def start_requests(self):
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
floorsheet_dates = ['01/03/2016','01/04/2016', up to till date '01/10/2022']
for date in floorsheet_dates:
driver.get(
"https://merolagani.com/Floorsheet.aspx")
driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
).send_keys(date)
driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
total_length = driver.find_element(By.XPATH,
"//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
z = int((total_length.split()[-1]).replace(']', ''))
for data in range(z, z + 1):
driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
self.url = driver.page_source
yield Request(url=self.url, callback=self.parse)
def parse(self, response, **kwargs):
for value in response.xpath('//tbody/tr'):
print(value.css('td::text').extract()[1])
print("ok"*200)
Update: Error after answer is
2022-01-14 14:11:36 [twisted] CRITICAL:
Traceback (most recent call last):
File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/twisted/internet/defer.py", line 1661, in _inlineCallbacks
result = current_context.run(gen.send, result)
File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/scrapy/crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable
I want to send current web html content to scrapy feeder but I am getting unusal error for past 2 days any help or suggestions will be very much appreciated.
Solution
The 2 solutions are not very different. Solution #2 fits better to your question, but choose whatever you prefer.
Solution 1 - create a response with the html's body from the driver and scraping it right away (you can also pass it as an argument to a function):
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from scrapy.http import HtmlResponse
class FloorSheetSpider(scrapy.Spider):
name = "nepse"
def start_requests(self):
# driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver = webdriver.Chrome()
floorsheet_dates = ['01/03/2016','01/04/2016']#, up to till date '01/10/2022']
for date in floorsheet_dates:
driver.get(
"https://merolagani.com/Floorsheet.aspx")
driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
).send_keys(date)
driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
total_length = driver.find_element(By.XPATH,
"//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
z = int((total_length.split()[-1]).replace(']', ''))
for data in range(1, z + 1):
driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
self.body = driver.page_source
response = HtmlResponse(url=driver.current_url, body=self.body, encoding='utf-8')
for value in response.xpath('//tbody/tr'):
print(value.css('td::text').extract()[1])
print("ok"*200)
# return an empty requests list
return []
Solution 2 - with super simple downloader middleware:
(You might have a delay here in parse
method so be patient).
import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By
class SeleniumMiddleware(object):
def process_request(self, request, spider):
url = spider.driver.current_url
body = spider.driver.page_source
return HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
class FloorSheetSpider(scrapy.Spider):
name = "nepse"
custom_settings = {
'DOWNLOADER_MIDDLEWARES': {
'tempbuffer.spiders.yetanotherspider.SeleniumMiddleware': 543,
# 'projects_name.path.to.your.pipeline': 543
}
}
driver = webdriver.Chrome()
def start_requests(self):
# driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
floorsheet_dates = ['01/03/2016','01/04/2016']#, up to till date '01/10/2022']
for date in floorsheet_dates:
self.driver.get(
"https://merolagani.com/Floorsheet.aspx")
self.driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
).send_keys(date)
self.driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
total_length = self.driver.find_element(By.XPATH,
"//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
z = int((total_length.split()[-1]).replace(']', ''))
for data in range(1, z + 1):
self.driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
self.body = self.driver.page_source
self.url = self.driver.current_url
yield Request(url=self.url, callback=self.parse, dont_filter=True)
def parse(self, response, **kwargs):
print('test ok')
for value in response.xpath('//tbody/tr'):
print(value.css('td::text').extract()[1])
print("ok"*200)
Notice that I've used chrome so change it back to firefox like in your original code.
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.