Issue
I got the output below after running a Python Scrapy spider. It shows that Scrapy got everything in place and started to crawl, but it stops immediately, before crawling the first page.
I tried it several times with different settings, but I got the same result, shown below.
2019-10-05 04:48:32 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: booking1)
2019-10-05 04:48:32 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.7, cssselect 1.0.3, parsel 1.3.1, w3lib 1.18.0, Twisted 17.9.0, Python 2.7.14 (v2.7.14:84471935ed, Sep 16 2017, 20:19:30) [MSC v.1500 32 bit (Intel)], pyOpenSSL 17.5.0 (OpenSSL 1.1.0g 2 Nov 2017), cryptography 2.1.4, Platform Windows-10-10.0.14393
2019-10-05 04:48:32 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'booking1.spiders', 'LOG_LEVEL': 'INFO', 'SPIDER_MODULES': ['booking1.spiders'], 'BOT_NAME': 'booking1', 'COOKIES_ENABLED': False, 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'}
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-10-05 04:48:32 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-10-05 04:48:33 [scrapy.middleware] INFO: Enabled item pipelines:
['booking1.pipelines.MoveImagesPipeline', 'booking1.pipelines.MysqlPipeline']
2019-10-05 04:48:33 [scrapy.core.engine] INFO: Spider opened
LIST
2019-10-05 04:48:33 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
0
2019-10-05 04:48:34 [scrapy.core.engine] INFO: Closing spider (finished)
0
2019-10-05 04:48:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 305,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 93563,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 10, 5, 4, 48, 34, 80000),
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2019, 10, 5, 4, 48, 33, 133000)}
2019-10-05 04:48:34 [scrapy.core.engine] INFO: Spider closed (finished)
The Scrapy code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from booking1.items import Booking1Item, BookingImageItem
from scrapy.loader.processors import TakeFirst, MapCompose, Join
from scrapy import signals
import re
import hashlib
import json
import scrapy.dupefilters
class BookingComSpider(scrapy.Spider):
    """Spider that crawls booking.com destination pages down to hotel detail pages.

    Which path the spider takes is decided in start_requests():
      * self.urls non-empty      -> fetch each hotel page via get_hotel()
      * self.image_urls non-empty -> emit BookingImageItem for each queued image
      * otherwise crawl the destination sitemap, starting at a depth chosen
        by self.run ('full' -> countries, 'country' -> towns, 'town' -> hotels).

    NOTE(review): self.mode is set in __init__ but never read in the visible
    code; behaviour is actually driven by self.urls / self.image_urls /
    self.run — confirm mode is used elsewhere (e.g. by the pipelines).
    This is Python 2 code (print statements, unicode built-in).
    """

    name = "booking.com"
    allowed_domains = ["booking.com"]
    # Search-results start URL; only used when Scrapy falls through to the
    # default parse() callback (start_requests() below overrides scheduling).
    start_urls = (
        'https://www.booking.com/searchresults.en-gb.html?city=-3006514',
        # 'http://www.booking.com/searchresults.en-gb.html?aid=376363&region=1005',
    )
    # Shared, class-level state (NOTE(review): class attributes are shared
    # across instances; fine for a single-spider run, but worth knowing).
    langs = []                      # extra language codes for get_lang() re-crawl
    counts = 0
    hotel_limits = 10000000000      # effectively "no limit" cap in dest_hotels()
    all_hotel_counts = 0            # total hotels seen; printed at spider close
    urls = []                       # pre-seeded hotel URLs -> get_hotel() path
    image_urls = []                 # pre-seeded (image_url, file_path) pairs
    checkin = '2019-12-10'
    checkout = '2019-12-11'

    def __init__(self):
        self.mode = 'LIST'  # LIST for hotel list, DATA for data of hotels, IMAGES for load images
        self.start_url = 'https://www.booking.com/destination.en-gb.html'
        # self.start_url = 'http://www.booking.com/destination/city/ae/dubai.html'
        # self.start_url = 'https://www.booking.com/destination/city/nl/rotterdam.html'
        self.run = 'full'  # should be 'town' or 'country' or 'full'; only used for LIST together with self.start_url

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Standard Scrapy hook: build the spider, then connect spider_closed."""
        spider = super(BookingComSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self):
        # Print the running hotel total when the crawl finishes (the bare "0"
        # lines in the question's log come from here and start_requests()).
        print self.all_hotel_counts

    def start_requests(self):
        """Dispatch to one of three crawl paths; see class docstring."""
        print len(self.urls)
        if self.urls:
            # Pre-seeded hotel URLs: go straight to the detail pages with
            # check-in/check-out dates appended so prices are rendered.
            for url in self.urls:
                yield scrapy.Request(
                    url + '?checkin={};checkout={};selected_currency=USD'.format(self.checkin, self.checkout),
                    self.get_hotel, meta={'url': url})
            return
        if self.image_urls:
            # Image mode needs one dummy request so Scrapy invokes a callback.
            yield scrapy.Request('http://booking.com', self.for_images)
            return
        # Sitemap crawl: pick the entry callback from self.run.
        # NOTE(review): if self.run is none of these values, `callback` is
        # unbound and the Request below raises NameError.
        if self.run == 'town':
            callback = self.dest_hotels
        if self.run == 'country':
            callback = self.dest_town
        if self.run == 'full':
            callback = self.dest_country
        rq = scrapy.Request(self.start_url, callback)
        yield rq
        return

    def for_images(self, response):
        """Emit one BookingImageItem per queued (image_url, file_path) pair."""
        for image in self.image_urls:
            item = BookingImageItem()
            item['image_url'] = image[0]
            item['file_path'] = image[1]
            yield item
        return

    def dest_country(self, response):
        """Destination sitemap root: follow every country link to dest_town."""
        # NOTE(review): the accepted answer points here — if this XPath
        # matches nothing on the current page markup, the spider exits after
        # one request, which matches the log in the question.
        for link in response.xpath('//a[@class="dest-sitemap__country-anchor"]/@href').extract():
            yield scrapy.Request(response.urljoin(link.split('?')[0]), self.dest_town)
            # break
        return

    def dest_town(self, response):
        """Country page: follow every /city/ link to dest_hotels."""
        for link in response.xpath('//table[@class="general"]/tr/td/a[contains(@href,"/city/")]/@href').extract():
            yield scrapy.Request(response.urljoin(link.split('?')[0]), self.dest_hotels)
            # break
        return

    def dest_hotels(self, response):
        """City page: emit a Booking1Item (url only) per hotel, up to hotel_limits."""
        # districts = response.xpath('//h3[contains(text()," Districts")]/following-sibling::table[1]//a/@href').extract()
        # for district in districts:
        #     yield scrapy.Request(response.urljoin(district), self.check_districts)
        # if not districts or 1:
        #     city_id = re.findall(r"b_ufi : '(-?\d+)',", response.body)[0]
        #     city_id = '-2960561'
        #     url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(city_id)
        #     yield scrapy.Request(url)
        hotels = response.xpath('//h3[contains(text(),"Hotels")]/following-sibling::table[1]//a/@href').extract()
        # print len(hotels)
        i = 1
        for hotel in hotels:
            # hotel = 'https://www.booking.com/hotel/it/eracle-volla.en-gb.html'
            url = response.urljoin(hotel.split('?')[0])
            item = Booking1Item()
            item['url'] = url
            # item['status'] = '0'
            yield item
            # yield scrapy.Request(response.urljoin(url)+'?checkin=2017-05-10;checkout=2017-05-11;selected_currency=USD',
            #                      self.get_hotel)
            i += 1
            if i > self.hotel_limits:
                break
            self.all_hotel_counts += 1
        # http://www.booking.com/searchresults.en-gb.html?nflt=ht_id%3D201&city=-2960561
        return

    def check_districts(self, response):
        """District page: re-enter the crawl via a district-filtered search URL."""
        district_id = response.xpath('//input[@name="district"]/@value').extract_first()
        yield scrapy.Request('http://www.booking.com/searchresults.en-gb.html?district={}'.format(district_id))
        return

    def get_list_country(self, response):
        """Alternative entry: list countries via the destinationfinder pages."""
        countries = response.xpath('//h2/a/@href').extract()
        for link in countries:
            # link = '/country/nl.en-gb.html'
            print link
            country = link.split('?')[0].split('/')[-1]
            print country
            country_code = country.split('.')[0]
            # country_code = 'nl'
            url_template = 'http://www.booking.com/destinationfinder/countries{}'
            yield scrapy.Request(url_template.format(link.split('?')[0]), self.get_country, meta={'code': country_code})
            # break
        return

    def get_country(self, response):
        """Country destinationfinder page: collect town ids, paginate if a full page (25) came back."""
        towns = []
        ids = response.xpath('//a[@class="dcard__button"]/@href').extract()
        towns += [id.split('city=')[-1].split(';')[0] for id in ids]
        ids = response.xpath('//div[@class="drow ddeck"]/div/div[@class="dcard_fake"]/div/@data-ufi').extract()
        towns += ids
        for town in towns:
            url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(town)
            yield scrapy.Request(url)
            # break
        # 25 results per page -> a full page implies there may be more.
        if len(towns) == 25:
            code = response.meta['code']
            url = 'http://www.booking.com/destinationfinder/ufis.en-gb.html?countries={};page=2'.format(code)
            rq = scrapy.Request(url, self.get_towns, meta={'page': 2, 'code': code})
            yield rq
        return

    def get_towns(self, response):
        """Paginated town lists: queue searches and keep paginating while pages are full."""
        towns = response.xpath('//div[@class="dcard_fake"]/div/@data-ufi').extract()
        for town in towns:
            url = 'http://www.booking.com/searchresults.en-gb.html?city={}'.format(town)
            yield scrapy.Request(url)
        if len(towns) == 25:
            code = response.meta['code']
            page = response.meta['page'] + 1
            url = 'http://www.booking.com/destinationfinder/ufis.en-gb.html?countries={};page={}'.format(code, page)
            rq = scrapy.Request(url, self.get_towns, meta={'page': page, 'code': code})
            yield rq

    def parse(self, response):
        """Search-results page: follow each hotel link to get_hotel, then paginate."""
        # url = 'http://www.booking.com/hotel/nl/reinade-de-luxe.html'
        for row in response.xpath('//div[@class="hotellist sr_double_search"]/div/div'):
            link = row.xpath('.//a[@class="hotel_name_link url"]/@href').extract_first()
            rait_text = row.xpath('.//div[@class="reviewFloater"]/div/a[1]/span/text()[normalize-space(.)] | '
                                  './/div[@class="reviewFloater"]/div/a[1]//p[contains(@class,"review-score-word")]'
                                  '/text()[normalize-space(.)]').extract_first()
            hotel_type = row.xpath('.//span[@class="sr-hotel__type"]/text()').extract_first()
            if not hotel_type:
                hotel_type = u'Hotel'
            if link:
                # NOTE(review): meta here has no 'url' key, but get_hotel()
                # reads response.meta['url'] — this path would raise KeyError;
                # confirm whether parse() is still meant to feed get_hotel().
                yield Request(response.urljoin(link.split('?')[0] + '?checkin=2017-03-01;checkout=2017-03-02'),
                              self.get_hotel, meta={'rait_text': rait_text, 'hotel_type': hotel_type})
                # break
        # "other choices" separator marks results outside the searched city:
        # stop paginating once it appears.
        sep = response.xpath('//span[@class="sr-separator-count"][contains(text(),"other choices")]')
        if sep:
            return
        for next_page in response.xpath('//a[@class="sr_pagination_link"]/@href').extract():
            # Rebuild the pagination URL keeping only rows/city/offset args.
            args = next_page.split(';')
            myargs = '?'
            for arg in args:
                if 'redirected' not in arg and ('rows' in arg or 'city' in arg or 'offset' in arg):
                    myargs += arg + ';'
            link = 'http://www.booking.com/searchresults.en-gb.html' + myargs
            yield Request(link)
        return

    def get_hotel(self, response):
        """Hotel detail page: scrape everything into a Booking1Item via BookingItemLoader.

        Emits a status='-1' stub item when no images are found (treated as a
        failed page), otherwise a fully-populated item with status='1'.
        """
        # if response.status == 404:
        #     open('out.html', 'w').write(response.body)
        # stars = response.xpath('//span[@class="hp__hotel_ratings__stars"]/i/@title').extract_first()
        # Image URLs are embedded in inline JS; fall back to the photo links.
        images = re.findall(r"large_url: '(.*?)'", response.body)
        if not images:
            images = response.xpath('//div[@id="photos_distinct"]/a[contains(@href,"http")]/@href').extract()
        if not images:
            item = Booking1Item()
            item['url'] = response.meta['url']
            item['status'] = '-1'
            yield item
            return
        # coods = response.xpath('//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        l = BookingItemLoader(item=Booking1Item(), response=response)
        l.add_xpath('name', '//h2[@id="hp_hotel_name"]/text()')
        l.add_xpath('location', '//div[@id="breadcrumb"]/div[position() > 1]/a/text() | '
                                '//div[@id="breadcrumb"]/div[last()]/text()[normalize-space(.)]')
        l.add_xpath('hotel_type', '//div[@id="viewmorealt"]/a/text()')
        l.add_xpath('rait_overall_text', '//p[contains(@class, "review-score-word")]/text() | '
                                         '//div[@id="reviewFloater"]'
                                         '//span[contains(@class, "js--hp-scorecard-scoreword")]/text() |'
                                         '//span[@class="review-score-widget__text"]/text()')
        l.add_xpath('address', '//span[@itemprop="address"]/text() | '
                               '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('street_address', '//span[@itemprop="address"]/text() | '
                                      '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('country', '//div[@property="itemListElement"][2]/a/text()')
        l.add_xpath('city', '//div[@id="breadcrumb"]/div[contains(@data-google-track, "city")]/a/text()')
        l.add_xpath('zip_code', '//span[@itemprop="address"]/text() | '
                                '//span[contains(@class, "hp_address_subtitle")]/text()')
        l.add_xpath('stars', '//span[contains(@class, "hp__hotel_ratings__stars")]/i/@title')
        l.add_xpath('description', '//div[@id="summary"]/p/text()')
        l.add_xpath('rating', '//div[@class="review_list_score"]/text() | '
                              '//div[@id="js--hp-gallery-scorecard"]/@data-review-score')
        l.add_xpath('no_rating', '//p[@class="review_list_score_count"]/strong/text() |'
                                 '//span[@class="review-score-widget__subtext"]/text()')
        l.add_xpath('room_types',  # '//div[@class="room-info"]/a//text() | '
                                   # '//td[contains(@class,"rt__room-detail ")]/span/a/text() |'
                                   # '//table[@id="maxotel_rooms"]//tr/td[@class="ftd"]/text() | '
                    '//a[@data-room-name-en]/@data-room-name-en')  # | '
        l.add_xpath('room_types', '//table[@id="maxotel_rooms"][contains(@class,"rt_no_dates")]//tr/td[1]/text()')
        l.add_xpath('room_totals',
                    '//table[@id="maxotel_rooms"]//tr/td[2]/span[contains(@title, "Standard occupancy")]/@title')
        l.add_xpath('room_totals_child', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                         '/following-sibling::tr[1]/td[1]//span[@class="invisible_spoken"]/text()')
        l.add_xpath('room_price', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                  '/following-sibling::tr[1]/td[2]/div/strong/text()')
        l.add_xpath('room_totals', '//tr[contains(@class,"room_loop_counter")][contains(@class,"maintr")]'
                                   '/following-sibling::tr[1]/@data-occupancy')
        # Coordinates are parsed out of the static-map image URL downstream.
        l.add_xpath('longitude', '//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        l.add_xpath('latitude', '//img[@class="static_map_one"]/@src | //a[@data-source="map_thumbnail"]/@style')
        l.add_value('images', images)
        # Facility checklists, one field per data-section-id block.
        l.add_xpath('food_drink', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="7"]'
                                  '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('services', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="3"]'
                                '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('outdoor', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="13"]'
                               '/ul/li//text()[normalize-space(.)]')
        l.add_xpath('parking', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="16"]/ul/li/'
                               'p//text()[normalize-space(.)]')
        l.add_xpath('activities', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="2"]/'
                                  'ul/li//text()[normalize-space(.)]')
        l.add_xpath('internet', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="11"]/ul/'
                                'li//text()[normalize-space(.)]')
        l.add_xpath('general', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="1"]/'
                               'ul/li//text()[normalize-space(.)]')
        l.add_xpath('miscellaneous', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="29"]/'
                                     'ul/li//text()[normalize-space(.)]')
        l.add_xpath('reception', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="23"]/'
                                 'ul/li//text()[normalize-space(.)]')
        l.add_xpath('common', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="24"]/'
                              'ul/li//text()[normalize-space(.)]')
        l.add_xpath('pool', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="21"]/'
                            'ul/li//text()[normalize-space(.)]')
        l.add_xpath('room_amenties', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="4"]/'
                                     'ul/li//text()[normalize-space(.)]')
        l.add_xpath('living_area', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="15"]/'
                                   'ul/li//text()[normalize-space(.)]')
        l.add_xpath('kitchen', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="12"]/'
                               'ul/li//text()[normalize-space(.)]')
        l.add_xpath('bedroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="17"]/'
                               'ul/li//text()[normalize-space(.)]')
        l.add_xpath('bathroom', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="5"]/'
                                'ul/li//text()[normalize-space(.)]')
        l.add_xpath('room_facilities', '//tr[contains(@class,"room_loop_counter1")]'
                                       '//div[contains(@class, "iconfont_wrapper")]/span/text()[normalize-space(.)] | '
                                       '//tr[contains(@class,"room_loop_counter1")]'
                                       '//div[contains(@class, "rt-all-facilities-hidden")]/span/text()')
        l.add_xpath('room_size', '//div[@class="info"][./strong[contains(text()," size:")]]/text()[normalize-space(.)]')
        l.add_xpath('included', '//div/div[contains(@class,"incExcInPriceNew")][1]/text()[normalize-space(.)]')
        l.add_xpath('not_included', '//div[contains(@class,"incExcInPriceNew")][2]/text()[normalize-space(.)]')
        l.add_xpath('cleaning', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="26"]/'
                                'ul/li//text()[normalize-space(.)]')
        l.add_xpath('business', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="27"]/'
                                'ul/li//text()[normalize-space(.)]')
        l.add_xpath('media', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="6"]/'
                             'ul/li//text()[normalize-space(.)]')
        l.add_xpath('fdesk_services', '//div[contains(@class,"facilitiesChecklistSection")][@data-section-id="23"]/'
                                      'ul/li//text()[normalize-space(.)]')
        # Policies section.
        l.add_xpath('pets', '//div[@id="hotelPoliciesInc"]/div[@class="description"][./p/span="Pets"]/p/text()')
        l.add_xpath('languages', '//div[contains(@class,"facilitiesChecklistSection")][last()]/div/ul/li/text()')
        l.add_xpath('checkin', '//div[@id="hotelPoliciesInc"]/div[@id="checkin_policy"]/p[2]/span/text()')
        l.add_xpath('checkout', '//div[@id="hotelPoliciesInc"]/div[@id="checkout_policy"]/p[2]/span/text()')
        l.add_xpath('children', '//div[@id="hotelPoliciesInc"]/div[@id="children_policy"]/p[not(@class)]//text()')
        l.add_xpath('cards', '//div[@class="description hp_bp_payment_method"]/p[2]/'
                             'button/@aria-label | //div[@class="description hp_bp_payment_method"]/p[2]/'
                             'span[@class="creditcard_noimg no-image-payment"]/text()')
        l.add_xpath('cancelation', '//div[@id="cancellation_policy"]/p[not(@class)]//text()')
        l.add_xpath('most_pop_fac', '//div[contains(@class, "hp_desc_important_facilities")]/div[@class]/'
                                    'text()[normalize-space(.)]')
        l.add_xpath('closest_landmarks', '//div[@class="hp-poi-content-section closest-landmarks"]/ul/li/'
                                         'span[1]//text()[normalize-space(.)]')
        l.add_xpath('popular_landmarks', '//div[@class="hp-poi-content-section popular-landmarks"]/ul/li/'
                                         'span[1]//text()[normalize-space(.)]')
        l.add_xpath('nearby_attractions', '//div[@class="hp_region_attractions_item "]/h4/text()')
        l.add_xpath('total_rooms', '//p[@class="summary hotel_meta_style"]/text()[normalize-space(.)][2] |'
                                   '//p[@class="summary hotel_meta_style"]/strong/text() |'
                                   '//p[@class="summary hotel_meta_style"]/a/text()')
        # NOTE(review): requires meta['url']; only start_requests() sets it.
        l.add_value('url', response.meta['url'])
        # l.add_value('image_urls', images)
        # l.add_value('image_names', images)
        l.add_value('status', '1')
        yield l.load_item()
        # url = 'http://www.booking.com/hotel_history_ufi?ufi=-2960561&currency=RUB&lang=en&aid=304142&sid=4c031dbed9125512ac8b2caba783d17c&stype=1&stid=304142&label=gen173nr-1DCAsowgFCEWFyYXJhdC1wYXJrLWh5YXR0SAliBW5vcmVmaMIBiAEBmAEuuAEPyAEP2AED6AEB-AEDqAID&checkin=2017-03-01&checkout=2017-03-02&_=1485632824219'
        # headers = {'X-Requested-With': 'XMLHttpRequest'}
        # yield scrapy.Request(url, self.get_score, meta={'item': item}, headers=headers)

    def get_score(self, response):
        """AJAX score endpoint: merge the JSON score text into the carried item."""
        jdata = json.loads(response.body)
        item = response.meta['item']
        item['rait_overall_text'] = jdata[0]['rait_overall_text']
        return item

    def get_lang(self, response):
        """Re-crawl the hotel page per language, adding localized name/address.

        Chains itself through response.meta until the langs list is empty,
        then yields the accumulated item.
        """
        lang = response.meta['lang']
        langs = response.meta['langs']
        item = response.meta['item']
        url = response.meta['url']
        item['name_{}'.format(lang.replace('-', '_'))] = response.xpath('//h1//text()').extract_first().strip()
        item['address_{}'.format(lang.replace('-', '_'))] = \
            response.xpath('//span[@itemprop="address"]/text()').extract_first().strip()
        if not langs:
            yield item
            return
        lang = langs.pop()
        url = url.replace('en-gb', '{}'.format(lang))
        rq = scrapy.Request(url, self.get_lang)
        rq.meta['url'] = response.meta['url']
        rq.meta['lang'] = lang
        rq.meta['langs'] = langs
        rq.meta['item'] = item
        yield rq
        return
class BookingItemLoader(ItemLoader):
    """ItemLoader with per-field input/output processors for Booking1Item.

    NOTE: get_zip and get_child deliberately take no `self` — they are meant
    to be wrapped by MapCompose at class-body time, where they are still
    plain functions, so each is called with a single extracted value
    (Python 2 idiom).
    """

    def get_zip(value):
        # Extract the last 4-digit(+suffix) token that looks like a postal
        # code from an address string; empty string when none matches.
        # NOTE(review): not referenced by any processor attribute visible in
        # this chunk — confirm it is used elsewhere.
        val = re.findall(r"(\d\d\d\d.?.?.?) ", value.strip())
        # m = re.search('\d\d\d\d.?.?.? ', value.strip())
        # val = m.group(0)
        if val:
            val = val[-1].strip()
        else:
            val = ''
        return val

    def get_child(value):
        # Map an occupancy description to a '1'/'0' children flag.
        if 'children' in value:
            return '1'
        else:
            return '0'

    def fix_rooms(self, value):
        # Output processor for room_types: detect the "every entry appears
        # twice in a row" duplication pattern and collapse the pairs.
        # NOTE(review): the final branches look inverted — when the dedup
        # succeeds (f == 1) the ORIGINAL list is joined, and when it fails
        # the partially-mutated copy is joined. Confirm intended behaviour.
        f = 1
        rooms = value[:]
        if len(rooms) % 2 == 0:
            i = 0
            while i < len(rooms):
                if rooms[i] == rooms[i + 1]:
                    del rooms[i]
                else:
                    f = 0
                    break
                i += 1
        if f:
            return '; '.join(value)
        else:
            return '; '.join(rooms)

    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)  # Python 2 `unicode`
    # hotel_type_in = MapCompose()
    # Strip the plural "...s in <place>" wording down to the bare type name.
    hotel_type_in = MapCompose(lambda value: re.findall(r'(.*?)s in |(.*)', value)[0])  # search 'atnm' in source also
    location_out = Join('; ')
    images_in = MapCompose()   # identity: bypass default strip for image URLs
    description_out = Join()
    images_out = MapCompose()  # identity: keep the full list of image URLs
    room_types_out = fix_rooms
    room_totals_in = MapCompose(lambda value: value.split()[-1])
    room_totals_out = Join('; ')
    room_totals_child_in = MapCompose(get_child)
    room_totals_child_out = Join('; ')
    room_price_out = Join('; ')
This is most of the code.
Solution
I tried your code: it reaches the dest_country
callback, but the XPath there doesn't match any links, so the spider just exits.
Maybe there is an issue with an XPath expression somewhere?
Answered By - Guillaume
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.