I'm trying to get all the xml file links from this domain. When I use the scrapy shell, I get the relative link I am expecting.
>>> response.xpath('//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href').extract()[1]
But when I try to yield all the links, I end up with a CSV that contains only incomplete links, or just the root link repeated many times over.
Example dataset:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class DhqSpider(CrawlSpider):
    """Spider from the question, shown with its original (buggy) logic.

    Links matching ``index.html`` are followed without a callback; links
    matching ``vol`` are handed to :meth:`parse_xml`.
    """

    name = 'dhq'
    # NOTE(review): the domain and start URL are empty strings here --
    # presumably stripped when the post was copied; fill in before running.
    allowed_domains = ['']
    start_urls = ['']

    rules = (
        Rule(LinkExtractor(allow = 'index.html')),
        Rule(LinkExtractor(allow = 'vol'), callback='parse_xml'),
    )

    def parse_xml(self, response):
        """Yield one item per element of ``xmllinks`` (see NOTE below)."""
        # NOTE: this is the behaviour the question asks about and is kept
        # on purpose. ``.extract()`` returns a list of strings, so ``[1]``
        # picks a single string (the second match, or raises IndexError if
        # there are fewer than two). The loop below therefore iterates over
        # the *characters* of that string and joins each one to the page URL.
        xmllinks = response.xpath('//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href').extract()[1]
        for link in xmllinks:
            yield {
                'file_urls': [response.urljoin(link)]
            }
What am I missing in my use of urljoin that is creating these incomplete and/or root links?
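Your spider scrapes data from each detail page, but your selection matches two elements and you need only one. Because `.extract()[1]` returns a single string, the for loop iterates over that string character by character, and joining each character to the page URL is what produces the incomplete and root links. Instead, apply the built-in indexing of the XPath expression to select a single element, which also avoids the unnecessary for loop.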
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class DhqSpider(CrawlSpider):
    """Crawl issue index pages and yield the XML link of each article page.

    Links matching ``index.html`` are followed without a callback; links
    matching ``vol`` are handed to :meth:`parse_xml`.
    """

    name = 'dhq'
    # NOTE(review): the domain and start URL are empty strings here --
    # presumably stripped when the post was copied; fill in before running.
    allowed_domains = ['']
    start_urls = ['']

    rules = (
        Rule(LinkExtractor(allow='index.html')),
        Rule(LinkExtractor(allow='vol'), callback='parse_xml'),
    )

    def parse_xml(self, response):
        """Yield ``{'file_urls': <absolute url>}`` for the page's XML link.

        The XPath ``(...)[1]`` selects the first matching ``href`` inside the
        toolbar, so no Python-side indexing or loop is needed. Pages without
        such a link yield nothing.
        """
        xmllink = response.xpath(
            '(//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href)[1]'
        ).get()
        if xmllink is None:
            # Without this guard urljoin(None) falls back to the page's own
            # URL, which would be exported as if it were an XML link.
            return
        # NOTE(review): kept as a plain string to match the sample output;
        # if Scrapy's FilesPipeline is enabled this must be a list of URLs.
        yield {
            'file_urls': response.urljoin(xmllink),
        }
{'file_urls': ''}
2022-12-14 20:28:58 [scrapy.core.engine] DEBUG: Crawled (200) <GET> (referer:
2022-12-14 20:28:58 [scrapy.core.scraper] DEBUG: Scraped from <200>
{'file_urls': ''}
2022-12-14 20:29:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET> (referer:
2022-12-14 20:29:03 [scrapy.core.scraper] DEBUG: Scraped from <200>
{'file_urls': ''}
2022-12-14 20:29:03 [scrapy.core.engine] INFO: Closing spider (finished)
2022-12-14 20:29:03 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 242004,
'downloader/request_count': 754,
'downloader/request_method_count/GET': 754,
'downloader/response_bytes': 69368110,
'downloader/response_count': 754,
'downloader/response_status_count/200': 754,
'dupefilter/filtered': 3221,
'elapsed_time_seconds': 51.448049,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 12, 14, 14, 29, 3, 317586),
'item_scraped_count': 697,
... so on
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class DhqSpider(CrawlSpider):
    """Crawl issue index pages and scrape metadata from each article page.

    Links matching ``index.html`` are followed without a callback; links
    matching ``vol`` are handed to :meth:`parse_xml`.
    """

    name = 'dhq'
    # NOTE(review): the domain and start URL are empty strings here --
    # presumably stripped when the post was copied; fill in before running.
    allowed_domains = ['']
    start_urls = ['']

    rules = (
        Rule(LinkExtractor(allow='index.html')),
        Rule(LinkExtractor(allow='vol'), callback='parse_xml'),
    )

    def parse_xml(self, response):
        """Yield one item with title, author, publication info and XML link.

        Missing title/author text is reported as an empty string instead of
        raising ``AttributeError``; a missing XML link is reported as ``None``.
        """
        # default='' keeps .strip() safe on pages that lack the element.
        title = response.css('h1.articleTitle::text').get(default='')
        # NOTE(review): this selector starts with a bare space -- the parent
        # part of the selector may have been lost in copying; confirm.
        author = response.css(' a::text').get(default='')
        # XPath (...)[1] picks the first matching href in the toolbar.
        xml_href = response.xpath(
            '(//div[@class="toolbar"]/a[contains(@href, ".xml")]/@href)[1]'
        ).get()

        yield {
            'title': title.strip().replace('\n', ' ').replace('\t', ''),
            'author': author.strip(),
            'pubinfo': response.css('div#pubInfo::text').getall(),
            # urljoin(None) would return the page's own URL, so only join
            # when a link was actually found.
            'xmllink': response.urljoin(xml_href) if xml_href else None,
        }
"title": "Textension: Digitally Augmenting Document Spaces in Analog Texts",
"author": "Adam James Bradley",
"pubinfo": [
"Volume 13 Number 3"
"xmllink": ""
"title": "Building the",
"author": "Cait Coker",
"pubinfo": [
"Volume 13 Number 3"
"xmllink": ""
"title": "Dendrography and Art History: a computer-assisted analysis of Cézanne’s",
"author": "Melinda Weinstein",
"pubinfo": [
"Volume 13 Number 3"
"xmllink": ""
"title": "The Invisible Work of the Digital Humanities Lab: Preparing Graduate Students for Emergent Intellectual and Professional Work",
"author": "Dawn Opel",
"pubinfo": [
"Volume 13 Number 2"
"xmllink": ""
"title": "Modelling Medieval Hands: Practical OCR for Caroline Minuscule",
"author": "Brandon W. Hawk",
"pubinfo": [
"Volume 13 Number 1"
"xmllink": ""
"title": "Introduction: Questioning",
"author": "Tarez Samra Graban",
"pubinfo": [
"Volume 13 Number 2"
"xmllink": ""
"title": "Racism in the Machine: Visualization Ethics in Digital Humanities Projects",
"author": "Katherine Hepworth",
"pubinfo": [
"Volume 12 Number 4"
"xmllink": ""
"title": "Narrelations — Visualizing Narrative Levels and their Correlations with Temporal Phenomena",
"author": "Hannah Schwan",
"pubinfo": [
"Volume 13 Number 3"
"xmllink": ""
"title": "Towards 3D Scholarly Editions: The Battle of Mount Street Bridge",
"author": "Costas Papadopoulos",
"pubinfo": [
"Volume 13 Number 1"
"xmllink": ""
"title": "Visual Communication and the promotion of Health: an exploration of how they intersect in Italian education",
"author": "Viviana De Angelis",
"pubinfo": [
"Volume 12 Number 4"
"xmllink": ""
"title": "Best Practices: Teaching Typographic Principles to Digital Humanities Audiences",
"author": "Amy Papaelias",
"pubinfo": [
"Volume 12 Number 4"
"xmllink": ""
"title": "Placing Graphic Design at the Intersection of Information Visualization Fields",
"author": "Yvette Shen",
"pubinfo": [
"Volume 12 Number 4"
"xmllink": ""
"title": "Making and Breaking: Teaching Information Ethics through Curatorial Practice",
"author": "Christina Boyles",
"pubinfo": [
"Volume 12 Number 4"
"xmllink": ""
"title": "Critically engaging with data visualization through an information literacy framework",
"author": "Steven Braun",
"pubinfo": [
"Volume 12 Number 4"
"xmllink": ""
"title": "Renaissance Remix.",
"author": "Deanna Shemek",
"pubinfo": [
"Volume 12 Number 4"
"xmllink": ""
"title": "Crowdsourcing Image Extraction and Annotation: Software Development and Case Study",
"author": "Ana Jofre",
"pubinfo": [
"Volume 14 Number 2"
"xmllink": ""
"title": "Defining scholarly practices, methods and tools in the Lithuanian digital humanities research community",
"author": "Ingrida Kelpšienė",
"pubinfo": [
"Volume 12 Number 4"
"xmllink": ""
Answered By - Md. Fazlul Hoque
Post a Comment
Note: Only a member of this blog may post a comment.