Issue
I'm setting up a proxy grabber for one site, but I'm getting nothing back.
import scrapy
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose
class ProxyServersPro(Item):
    """Item declaring the fields ip, port, country, speed, protocol and anon."""

    # Plain field declarations; no input/output processors are attached.
    ip = Field()
    port = Field()
    country = Field()
    speed = Field()
    protocol = Field()
    anon = Field()
class ProxyServersPro(CrawlSpider):
    """Crawl spider that hands links matching "page" to ``parse_item``.

    NOTE(review): this class reuses the name ``ProxyServersPro`` that the item
    class defined earlier in the file already uses. The later definition
    replaces the earlier one, so the ``ProxyServersPro()`` call inside
    ``parse_item`` resolves to this spider class rather than to the item --
    confirm whether that is intended.
    """

    name = "ProxyServersProCrawler"
    allowed_domains = ['proxyservers.pro']
    start_urls = [
        "https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/1",
    ]

    # Single rule: links whose URL matches "page" are passed to parse_item.
    rules = {
        Rule(LinkExtractor(allow=r'page'), callback='parse_item'),
    }

    def parse_item(self, response):
        """Fill an ItemLoader from fixed XPath expressions and return the item.

        Every expression below addresses ``tr[1]``, i.e. only the first row
        of the table on the page.
        """
        field_xpaths = (
            ('ip', '//*[@id="content-content"]/div/div/div[1]/table/tbody/tr[1]/td[2]/a/text()'),
            ('port', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[3]/span/text()'),
            ('country', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[4]/text()'),
            ('speed', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[5]/div[1]/div/div/text()'),
            ('protocol', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[7]/text()'),
            ('anon', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[8]/text()'),
        )
        loader = ItemLoader(ProxyServersPro(), response=response)
        for field_name, expression in field_xpaths:
            loader.add_xpath(field_name, expression)
        return loader.load_item()
And this is what the console says:
2019-03-24 04:53:27 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
Can someone figure out what is going on? Thanks.
Solution
Here is a working example; please have a look:
# -*- coding: utf-8 -*-
from scrapy import Selector
from scrapy.http import Request, FormRequest, HtmlResponse
from scrapy.spiders import CrawlSpider
from scrapy.conf import settings
import urllib
import json
import re
from urllib.parse import urljoin
from html.parser import HTMLParser
from requests import Session
from scrapy import Item, Field
class ProxyServersPro(Item):
    """Item holding the values scraped for one proxy entry.

    Each field is declared exactly once.
    """

    ip = Field()
    port = Field()
    country = Field()
    # Declared for completeness; the parse callback visible in this file
    # does not assign it.
    speed = Field()
    protocol = Field()
    anon = Field()
class ProxyServers(CrawlSpider):
    """Spider that requests a fixed set of listing pages and yields one item per table row.

    No crawl rules are defined; ``start_requests`` issues every request
    explicitly and routes the responses to ``parse_companies``.
    """

    name = "ProxyServersProCrawler"
    allowed_domains = ['proxyservers.pro']

    # Headers attached to every request issued by start_requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    # Listing pages 1 through 5.
    start_url = [
        (
            'https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite'
            '/order/duration/order_dir/asc/page/{}'
        ).format(page)
        for page in range(1, 6)
    ]

    # (item field, XPath relative to one table row), in assignment order.
    # NOTE(review): the accompanying write-up says the port is not present in
    # the page's HTML, so the 'port' expression may yield None -- confirm.
    _field_xpaths = (
        ('ip', './td[2]/a/text()'),
        ('country', './td[4]/text()'),
        ('protocol', './td[7]/text()'),
        ('anon', './td[8]/text()'),
        ('port', './td[3]/text()'),
    )

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base spider.

        Accepting ``*args``/``**kwargs`` keeps the no-argument call working
        while no longer raising ``TypeError`` when arguments are supplied.
        """
        super(ProxyServers, self).__init__(*args, **kwargs)

    def start_requests(self):
        """Yield one request per URL in ``start_url``, sent with ``headers``."""
        for url in self.start_url:
            yield Request(url, callback=self.parse_companies, headers=self.headers)

    def parse_companies(self, response):
        """Yield a ``ProxyServersPro`` item for each row of the proxy table.

        A field is set to ``None`` when its XPath matches nothing in the row,
        because ``extract_first()`` is used without a default.
        """
        rows = response.xpath('//table[@class="table table-hover"]/tbody/tr')
        for row in rows:
            item = ProxyServersPro()
            for field_name, expression in self._field_xpaths:
                item[field_name] = row.xpath(expression).extract_first()
            yield item
Also note that port and speed are not present in the page's HTML content; they are loaded dynamically, so we cannot get them through XPath.
Answered By - Alpha Romeo
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.