Issue
This is my spider and it is working but, how could I send another spider on new found URL. Now, I´m storing all links whose begin with HTTP
, HTTPS
or if it is /
I add the base URL.
Then I will iterate that array and call a new spider on the new URL (It is on the end of the code)
I can´t scrape on the new URL (I know because print()
is not showing on the console)
import scrapy
import re
class GeneralSpider( scrapy.Spider ):
name = "project"
start_urls = ['https://www.url1.com/',
'http://url2.com']
def parse( self, response ):
lead = {}
lead['url'] = response.request.url
lead['data'] = {}
lead['data']['mail'] = []
lead['data']['number'] = []
selectors = ['//a','//p','//label','//span','//i','//b','//div'
'//h1','//h1','//h3','//h4','//h5','//h6','//tbody/tr/td']
atags = []
for selector in selectors:
for item in response.xpath( selector ):
name = item.xpath( 'text()' ).extract_first()
href = item.xpath( '@href' ).extract_first()
if selector == '//a' and href is not None and href !='' and href !='#':
if href.startswith("http") or href.startswith("https"):
atags.append( href )
elif href.startswith("/"):
atags.append( response.request.url + href )
if href is not None and href !='' and href !='#':
splitted = href.split(':')
if splitted[0] not in lead['data']['mail'] and splitted[0] == 'mailto':
lead['data']['mail'].append(splitted[1])
elif splitted[0] not in lead['data']['number'] and splitted[0] == 'tel':
lead['data']['number'].append(splitted[1])
else:
if name is not None and name != '':
mail_regex = re.compile( r'^(([^<>()[\]\\.,;:\s@\"]+(\.[^<>()[\]\\.,;:\s@\"]+)*)|(\".+\"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$' )
number_regex = re.compile( r'^(?:\(\+?\d{2,3}\)|\+?\d{2,3})\s?(?:\d{4}[\s*.-]?\d{4}|\d{3}[\s*.-]?\d{3}|\d{2}([\s*.-]?)\d{2}\1?\d{2}(?:\1?\d{2})?)(?:\1?\d{2})?$' )
if name not in lead['data']['mail'] and re.match( mail_regex, name ):
lead['data']['mail'].append(name)
elif name not in lead['data']['number'] and re.match( number_regex, name ):
lead['data']['number'].append(name)
print( lead )
#I want here call parse method again but with new url
for tag in atags:
scrapy.Request( tag, callback=self.parse )
Solution
You need to return the Request object in your function.
Since you are generating multiple ones, you can use yield
, like this:
yield scrapy.Request(tag, callback=self.parse)
"In the callback function, you parse the response (web page) and return either dicts with extracted data, Item objects, Request objects, or an iterable of these objects." See scrapy documentation
Answered By - Sewake
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.