Issue
I have this code:
from logging import INFO
import scrapy
class LinkedInAnonymousSpider(scrapy.Spider):
name = "linkedin_anonymous"
allowed_domains = ["linkedin.com"]
start_urls = []
base_url = "https://www.linkedin.com/pub/dir/?first=%s&last=%s&search=Search"
def __init__(self, input=None, first=None, last=None):
self.input = input # source file name
self.first = first
self.last = last
def start_requests(self):
if self.first and self.last: # taking input from command line parameters
url = self.base_url % (self.first, self.last)
yield self.make_requests_from_url(url)
elif self.input: # taking input from file
i = 0
self.log('Input from file: %s' % self.input, INFO)
for line in open(self.input, 'r').readlines():
i += 1
if line.strip(): # no blank line
t = line.split("\t")
name = t[0]
parts = [n.strip() for n in name.split(' ')]
last = parts.pop()
first = " ".join(parts)
if first and last:
url = self.base_url % (first, last)
yield self.make_requests_from_url(url)
else:
raise Exception('No input.')
def parse(self, response):
# if there is exactly one match the person's profile page is returned
if response.xpath('//div[@class="profile-overview-content"]').extract():
yield scrapy.Request(response.url, callback=self.parse_full_profile_page)
else:
# extracting profile urls from search result
for sel in response.css('div.profile-card'):
url = sel.xpath('./*/h3/a/@href').extract()[0] # Person's full profile URL in LinkedIn
yield scrapy.Request(url, callback=self.parse_full_profile_page)
........
With this code, I get the profile details of a list of people from linkedin.
I have written such a main function in order to do that.
import scrapy
import sys
from linkedin_anonymous_spider import LinkedInAnonymousSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
if __name__ == "__main__":
firstname = ['Hasan', 'James']
lastname = ['Arslan', 'Bond']
for a in range(len(firstname)):
settings = get_project_settings()
crawler = CrawlerProcess(settings)
spider = LinkedInAnonymousSpider()
crawler.crawl(spider, [], firstname[a], lastname[a])
crawler.start()
When the loop comes to the 2nd step, I get this error:
raise error.ReactorNotRestartable() twisted.internet.error.ReactorNotRestartable
How can I fix the problem?
Thanks.
Solution
You can only run one reactor, so just calling crawler.start()
once.
Try passing crawler.start()
out of the loop.
Answered By - eLRuLL
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.