Issue
I am running a Scrapy spider using Flask and crochet. In it I am using a Rule with a LinkExtractor to define the crawling rules. In the rule, I am setting allow_domains,
which is passed in from the Flask application.
spider.py
# Spider as posted in the question (indentation was flattened by the blog scrape).
class myCrawler(CrawlSpider):
name = 'symphony'
base_url=''
start_urls = []
allowed_domains = ''
def __init__(self, category='', **kwargs):
super().__init__(**kwargs)
self.base_url = category
# Keep only the last two labels of the host, e.g. 'example.com' for
# 'https://www.example.com'.
self.allowed_domains = ['.'.join(urlparse(self.base_url).netloc.split('.')[-2:])]
self.start_urls.append(self.base_url)
print(f"Base url is {self.base_url} and allowed domain is {self.allowed_domains}")
custom_settings = {
# in order to reduce the risk of getting blocked
'DOWNLOADER_MIDDLEWARES': {'sitescrapper.sitescrapper.middlewares.RotateUserAgentMiddleware': 400, },
'COOKIES_ENABLED': False,
'CONCURRENT_REQUESTS': 6,
'DOWNLOAD_DELAY': 2,
# Duplicates pipeline
'ITEM_PIPELINES': {'sitescrapper.sitescrapper.pipelines.DuplicatesPipeline': 300},
# In order to create a CSV file:
'FEEDS': {'csv_file.csv': {'format': 'csv'}}
}
rules = (
Rule(
# BUG (the subject of the question): 'self' does not exist at class-body
# level, so this line raises NameError when the class is being defined.
LinkExtractor(allow_domains='.'.join(urlparse(self.base_url).netloc.split('.')[-2:])),
process_links=process_links,
callback='parse_item',
follow=True
),
)
Here I wrote LinkExtractor(allow_domains='.'.join(urlparse(self.base_url).netloc.split('.')[-2:])).
But self is not defined at class level, so this raises a NameError.
How can I assign the value of the expression '.'.join(urlparse(self.base_url).netloc.split('.')[-2:])
to allow_domains
(which is the same value as self.allowed_domains)?
Or is there a better way to do this?
Solution
The problem here is that the CrawlSpider constructor (__init__) also processes the rules attribute, so if we need to assign the rules dynamically, we have to do it before calling the parent constructor.
class myCrawler(CrawlSpider):
    """Crawl a single site, restricted to its registrable domain.

    The base URL is supplied at construction time (from the Flask app),
    so the rules must also be built inside __init__ — and they must be
    assigned *before* calling super().__init__(), because the
    CrawlSpider constructor compiles self.rules.
    """

    name = 'symphony'
    rotate_user_agent = True
    base_url = ''
    start_urls = []
    allowed_domains = ''

    def __init__(self, category='', **kwargs):
        self.base_url = category
        # Keep only the last two labels of the host, e.g. 'example.com'
        # for 'https://www.example.com'.
        self.allowed_domains = ['.'.join(urlparse(self.base_url).netloc.split('.')[-2:])]
        # Assign a fresh instance-level list instead of appending to the
        # class-level one: appending would make URLs accumulate across
        # spider instances when Flask launches several crawls in one process.
        self.start_urls = [self.base_url]
        print(f"Base url is {self.base_url} and allowed domain is {self.allowed_domains}")

        # Rules are defined here (not at class level) because they depend on
        # the constructor argument; they must exist before the parent
        # constructor runs, since CrawlSpider.__init__ compiles them.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                process_links=process_links,
                callback='parse_item',
                follow=True
            ),
        )
        super().__init__(**kwargs)

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'sitescrapper.sitescrapper.middlewares.RotateUserAgentMiddleware': 400, },
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,
        # Duplicates pipeline
        'ITEM_PIPELINES': {'sitescrapper.sitescrapper.pipelines.DuplicatesPipeline': 300},
        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}}
    }
Answered By - imhans4305
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.