Issue
I am trying to scrape a GitHub organisation.
I want to extract the URLs of all XML files in the level1 folder of every repo and, in the best case, also extract information from those XML files.
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# Follow links to every repository whose visible name matches "ELTeC-*".
repo_rule = Rule(
    LinkExtractor(
        restrict_xpaths="//a[@itemprop='name codeRepository']",
        restrict_text=r"ELTeC-.+",
    )
)

# Follow the "next" link of the paginated repository listing.
pagination_rule = Rule(LinkExtractor(restrict_xpaths="//a[@class='next_page']"))

# Descend into each repo's level1 folder and hand the response to the
# spider's parse_level() callback; keep following links from there.
level_rule = Rule(
    LinkExtractor(allow=r"/level1"),
    follow=True,
    callback="parse_level",
)
class ELTecSpider(CrawlSpider):
    """Crawl the COST-ELTeC GitHub organisation and visit level1 folders."""

    name = "eltec"
    start_urls = ["https://github.com/orgs/COST-ELTeC/repositories"]

    # CrawlSpider applies every rule to each fetched page.
    rules = [repo_rule, pagination_rule, level_rule]

    def parse_level(self, response):
        # Callback for every URL matched by level_rule.
        print("INFO: ", response.url)
# Export scraped items as JSON, replacing the file on every run.
crawler_settings = {
    "FEEDS": {
        "items.json": {"format": "json", "overwrite": True},
    },
}

process = CrawlerProcess(settings=crawler_settings)
process.crawl(ELTecSpider)
process.start()  # blocks until the crawl finishes
The above extracts the responses for all level1 folders, but somehow I am stuck at this point. My plan was to go down every level1 URL using callbacks like so:
def parse_level(self, response):
    """Re-request the level1 page so parse_docs can process it.

    Scrapy's duplicate filter silently drops a second request for a URL
    it has already fetched, which is why the callback never fires in the
    original code; dont_filter=True bypasses that filter for this
    deliberate re-fetch.
    """
    yield scrapy.Request(response.url, callback=self.parse_docs, dont_filter=True)
def parse_docs(self, response):
    """Print every anchor with class 'Link--primary' on the page.

    NOTE(review): GitHub appears to render this file listing with
    JavaScript, so the XPath may match nothing in the raw HTML
    response — verify before relying on it.
    """
    for anchor in response.xpath("//a[@class='Link--primary']"):
        print("INFO: ", anchor)
But apparently the callback never even fires.
What am I doing wrong?
Solution
Scrapy remembers visited pages and skips scraping the same URL again.
This way it doesn't waste time re-fetching the same page, and it also prevents crawling loops.
When you run scrapy.Request(response.url, ...)
you try to scrape the same URL again, so Scrapy skips it.
If you really need to scrape the same page again, use
Request(..., dont_filter=True)
(Doc: scrapy.http.Request)
I would rather skip the extra request and delegate to the other callback directly:
yield from self.parse_docs(response)
There is another problem inside parse_docs():
the XPath doesn't find any element, so the for loop never runs any print().
You should add an extra print()
at the beginning of parse_docs()
to see when it is executed.
The XPath may not find class='Link--primary'
because this page uses JavaScript to add elements.
Handling that may require Selenium and the scrapy-selenium module
to control a real web browser that can run JavaScript.
Scrapy also has Splash and scrapy-splash for working with JavaScript.
(Doc: Selecting dynamically-loaded content)
Maybe GitHub has an API to get this information without scraping.
EDIT:
Full working code which uses scrapy-selenium
and Selenium 3.
It doesn't work with Selenium 4
because scrapy-selenium
hasn't been updated since 2020.
pip install scrapy-selenium
pip install 'selenium<4'
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
#from shutil import which # to run `which('firefox')` or `which('chrome')` in settings
# Rule 1: enter every repository link whose visible text starts "ELTeC-".
repo_rule = Rule(
    LinkExtractor(
        restrict_xpaths="//a[@itemprop='name codeRepository']",
        restrict_text=r"ELTeC-.+",
    )
)

# Rule 2: walk the pagination of the repository listing.
pagination_rule = Rule(LinkExtractor(restrict_xpaths="//a[@class='next_page']"))

# Rule 3: descend into level1 folders and invoke parse_level on each.
level_rule = Rule(
    LinkExtractor(allow=r"/level1"),
    follow=True,
    callback="parse_level",
)
class ELTecSpider(CrawlSpider):
    """Crawl COST-ELTeC repos and list the XML documents under level1.

    Uses scrapy-selenium so that GitHub's JavaScript-rendered file
    listing is present when the page is parsed.
    """

    name = "eltec"
    start_urls = [
        "https://github.com/orgs/COST-ELTeC/repositories",
        "https://github.com/COST-ELTeC/ELTeC-lit/tree/master/level1",
    ]
    rules = [repo_rule, pagination_rule, level_rule]

    def parse_level(self, response):
        """Re-fetch the level1 page through Selenium so JavaScript runs."""
        print("\n>>> PARSE LEVEL:", response.url)
        # dont_filter=True: Scrapy already fetched this URL, so the
        # duplicate filter would otherwise drop the request.
        yield SeleniumRequest(
            url=response.url,
            callback=self.parse_docs,
            dont_filter=True,
            wait_time=10,
            # Wait until the JS-injected file links exist in the DOM.
            wait_until=EC.presence_of_element_located((By.CLASS_NAME, 'Link--primary')),
        )

    def parse_docs(self, response):
        """Yield one item per file link found on the rendered page."""
        print("\n>>> PARSE DOC:", response.url)
        for anchor in response.selector.xpath("//a[@class='Link--primary']"):
            yield {
                "text": anchor.xpath('.//text()').get(),
                "url": anchor.xpath('.//@href').get(),
            }
# Crawler settings: JSON feed export plus the scrapy-selenium downloader
# middleware driving a headless Firefox via geckodriver.
crawler_settings = {
    "FEEDS": {
        "items.json": {"format": "json", "overwrite": True},
    },
    'SELENIUM_DRIVER_NAME': 'firefox',  # or 'chrome'
    'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/furas/bin/geckodriver',  # or which('geckodriver'), which('chromedriver')
    'SELENIUM_DRIVER_ARGUMENTS': ['-headless'],  # '--headless' if using chrome; needs at least an empty list
    'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800},
}

process = CrawlerProcess(settings=crawler_settings)
process.crawl(ELTecSpider)
process.start()
Result from items.json
[
{"text": "LIT00001_seinius_kuprelis.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00001_seinius_kuprelis.xml"},
{"text": "LIT00001_seinius_kuprelis.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00001_seinius_kuprelis.xml"},
{"text": "LIT00002_pietaris_algimantas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00002_pietaris_algimantas.xml"},
{"text": "LIT00002_pietaris_algimantas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00002_pietaris_algimantas.xml"},
{"text": "LIT00004_dobilas_bludas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00004_dobilas_bludas.xml"},
{"text": "LIT00004_dobilas_bludas.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00004_dobilas_bludas.xml"},
{"text": "LIT00005_daukantas_zemaiciu_pasakos.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00005_daukantas_zemaiciu_pasakos.xml"},
{"text": "LIT00005_daukantas_zemaiciu_pasakos.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00005_daukantas_zemaiciu_pasakos.xml"},
{"text": "LIT00006_kudirka_virsininkai.xml", "url": "/COST-ELTeC/ELTeC-lit/blob/master/level1/LIT00006_kudirka_virsininkai.xml"},
Answered By - furas
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.