Issue
I'm scraping from 2 different tables and want to combine the data into MongoDB.
Now I have a problem with the second table I want to scrape. The table has 1 header row and 5 data rows. How can I scrape it so that each MongoDB field contains all the values of the corresponding table column?
The table I want to scrape looks like this: https://codepen.io/linkslegend/pen/JjPrqLq
This is the code I have so far:
import scrapy
import pymongo
from ..items import testItem
class IssSpider(scrapy.Spider):
    """Spider that crawls the listing in ``start_urls`` and scrapes each linked detail page."""

    name = "test_spider"
    start_urls = ["https://de.iss.fst.com/dichtungen/radialwellendichtringe/rwdr-mit-geschlossenem-kafig/ba"]

    def parse(self, response):
        """Request every detail page linked from a listing page, then the next listing page.

        Yields one ``scrapy.Request`` per ``.details-button`` link (handled by
        ``parse_details``) and, if a "next" link exists, one request for it
        (handled by ``parse`` again).
        """
        self.log("I just visted:" + response.url)
        urls = response.css('.details-button > a::attr(href)').extract()
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_details)
        # follow pagination link
        next_page_url = response.css('li.item > a.next::attr(href)').extract_first()
        if next_page_url:
            # The href may be relative; urljoin resolves it against the current page URL.
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        """Fill a ``testItem`` from the attribute table and the offers table of a detail page."""
        item = testItem()
        # NOTE: the trailing comma after each .strip() call turns the right-hand
        # side into a one-element tuple, so these fields hold ('value',) rather
        # than 'value'.
        # NOTE: extract_first() returns None when the selector matches nothing,
        # in which case .strip() raises AttributeError.
        item['Artikelnummer'] = response.css('td[data-th="Artikelnummer"]::text').extract_first().strip(),
        item['Hersteller'] = response.css('td[data-th="Hersteller"]::text').extract_first().strip(),
        item['Materialvariante'] = response.css('td[data-th="Materialvariante"]::text').extract_first().strip(),
        item['Material'] = response.css('td[data-th="Material"]::text').extract_first().strip(),
        item['Gewicht_Gramm'] = response.css('td[data-th="Gewicht (Gramm)"]::text').extract_first().strip(),
        item['Gehaeusedurchmesser'] = response.css('td[data-th="Gehäusedurchmesser"]::text').extract_first().strip(),
        item['Breite'] = response.css('td[data-th="Breite"]::text').extract_first().strip(),
        item['Innendurchmesser'] = response.css('td[data-th="Innendurchmesser"]::text').extract_first().strip(),
        item['Wellendurchmesser'] = response.css('td[data-th="Wellendurchmesser"]::text').extract_first().strip(),
        item['Außendurchmesser'] = response.css('td[data-th="Außendurchmesser"]::text').extract_first().strip(),
        # Each iteration assigns to the same four keys of the same item, so a
        # later offer row replaces the values written for the previous one.
        for row in response.css('tr.offer'):
            item['Lieferant'] = row.css('td.vendor > span.offer-vendor::text').extract_first().strip(),
            item['Anforderungsmenge'] = row.css('td.item-no > span.offer-item-no::text').extract_first().strip(),
            item['Lieferzeit'] = row.css('td.replenishment-time > span.offer-replenishment-time::text').extract_first().strip(),
            item['PreisproStueck'] = row.css('td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract_first().strip()
        # NOTE(review): indentation was lost in the original paste; the yield is
        # placed after the loop here, which means only the last offer row ends
        # up in the item — confirm against the original code.
        yield item
And this is the pipeline for MongoDB:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class testPipeline(object):
    """Item pipeline that stores every scraped item as one MongoDB document."""

    def __init__(self):
        """Create a client for the MongoDB server on localhost:27017 and pick the collection."""
        self.conn = pymongo.MongoClient(
            "localhost",
            27017,
        )
        db = self.conn["test_db"]
        self.collection = db["test_tb"]

    def process_item(self, item, spider):
        """Insert *item* into the collection and return it unchanged.

        Args:
            item: The scraped item; it is converted with ``dict(item)`` before
                being inserted, so the item object itself is not modified.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so that later pipelines can keep processing it.
        """
        # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
        # insert_one() is the supported call for a single document.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.conn.close()
My current MongoDB document looks like this:
This is how I want the MongoDB document to look:
thanks for any help!
Solution
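I think the problem is that you're overwriting the values for Lieferant, Anforderungsmenge, Lieferzeit & PreisproStueck in your loop: every iteration assigns to the same item keys, so only the values of the last row remain. You can try collecting the values in lists instead, substituting the loop with the following: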
def parse_details(self, response):
    """Build one item whose offer fields are lists with one entry per offer row.

    Instead of assigning each row's values to the same item keys (which keeps
    only the last row), the values of every ``tr.offer`` row are appended to
    per-column lists, and the lists are stored on the item once the loop ends.

    Yields:
        A single ``testItem`` for the page.
    """
    item = testItem()
    ...  # fill the single-value fields (Artikelnummer, Hersteller, ...) as before
    lieferants = []
    anforderungsmenges = []
    lieferzeits = []
    preisprostuecks = []
    for row in response.css('tr.offer'):
        # No trailing commas after .strip(): `name = value,` would store a
        # one-element tuple instead of the string.
        # default='' keeps a missing cell from raising on None.strip() and
        # still appends a placeholder, so the four lists stay aligned by row.
        lieferant = row.css('td.vendor > span.offer-vendor::text').extract_first(default='').strip()
        anforderungsmenge = row.css('td.item-no > span.offer-item-no::text').extract_first(default='').strip()
        lieferzeit = row.css('td.replenishment-time > span.offer-replenishment-time::text').extract_first(default='').strip()
        preisprostueck = row.css('td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract_first(default='').strip()
        lieferants.append(lieferant)
        anforderungsmenges.append(anforderungsmenge)
        lieferzeits.append(lieferzeit)
        preisprostuecks.append(preisprostueck)
    item['lieferants'] = lieferants
    item['anforderungsmenges'] = anforderungsmenges
    item['lieferzeits'] = lieferzeits
    item['preisprostuecks'] = preisprostuecks
    yield item
You can also get the lists directly instead of looping over them, similar to the below (untested) code:
def parse_details(self, response):
    """Build one item whose offer fields are lists, extracted column by column.

    Each list is produced by one selector that matches the same cell in every
    ``tr.offer`` row, so no explicit loop over the rows is needed.

    Yields:
        A single ``testItem`` for the page.
    """
    item = testItem()
    ...  # fill the single-value fields (Artikelnummer, Hersteller, ...) as before
    # Every selector is scoped to tr.offer and targets its own column.
    # NOTE: because the columns are extracted independently, a row with a
    # missing cell makes that list shorter and shifts its later entries.
    item['lieferants'] = [lieferant.strip() for lieferant in response.css('tr.offer > td.vendor > span.offer-vendor::text').extract()]
    item['anforderungsmenges'] = [anforderungsmenge.strip() for anforderungsmenge in response.css('tr.offer > td.item-no > span.offer-item-no::text').extract()]
    item['lieferzeits'] = [lieferzeit.strip() for lieferzeit in response.css('tr.offer > td.replenishment-time > span.offer-replenishment-time::text').extract()]
    item['preisprostuecks'] = [preisprostueck.strip() for preisprostueck in response.css('tr.offer > td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract()]
    yield item
Answered By - Wim Hermans
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.