Issue
I am trying to parse information from multiple pages using scrapy. But it doesn't seem to yield an item after it is finished. What could be the issue?
class TransfersSpider(scrapy.Spider):
name = "transfers"
start_urls = 'https://www.transfermarkt.com/kevin-de-bruyne/profil/spieler/88755'
def parse(self, response):
item = PlayerTransfersItem()
info_table = response.css('div[class="info-table info-table--right-space "]') or response.css('div[class="large-6 large-pull-6 small-12 columns spielerdatenundfakten"]')
item["name"] = response.xpath('//h1/text()').get().strip() + " " + response.xpath('//h1//following-sibling::strong/text()').get(default = "")
stats_url = response.url.replace('profil', 'leistungsdaten') #this url will be used to find the all seasons this player played in
yield scrapy.Request(stats_url, callback= self.parse_played_seasons, cb_kwargs={"item": item})
def parse_played_seasons(self, response, item): #Obtain every season the player has played in
item["seasons_stats"] = list()
seasons = response.css('div[class="inline-select"] > select[name="saison"] >option::attr(value)').getall() # Total seasons player played in
for season in seasons: # parse stats from each season
url = f"{response.url}/plus/0?saison={season}"
yield scrapy.Request(url, callback=self.parse_season, cb_kwargs= {"item": item, "season": season})
yield item #This returns a None value
def parse_season(self, response, item, season):
tables = response.css('div[class="box"] > div[class="responsive-table"]')
total_table = tables[0].css('tr> td::text').get()
if "Total" in total_table: #If there is a table with a row shwoing total results
appearances_total_table = tables[0].css('tr> td[class="zentriert"]::text').get()
goals_total_table = tables[0].css('tr> td[class="zentriert"]::text')[1].get()
assists_total_table = tables[0].css('tr> td[class="zentriert"]::text')[2].get()
season_stats = { season:{"total_goals": goals_total_table,"total_assists" : assists_total_table, "minutes_played": minutes_total_table,
"appearances": appearances_total_table}}
item["seasons_stats"].append(season_stats)
I want to get the stats of the player in each season, so why does it return a none value. But when I place yield in the parse_season
function, it return duplicates of the item in each season.
Solution
First add this function to your pipeline. It will add all the season_stats
of players with the same name.
import json
def combine(L):
results = {}
for item in L:
key = (item["name"])
if key in results: # combine them
total_ = item["season_stats"] + results[key]["season_stats"]
total= [i for n, i in enumerate(total_) if i not in total_[n + 1:]]
results[key] = {"name": item["name"], "season_stats":total }
else: # don't need to combine them
results[key] = item
return list(results.values())
Then modify your pipeline to append them to a list where this function will be performed, then dumped to a json.
class Pipeline:
def __init__(self):
self.players = []
def process_item(self, item, spider):
self.players.append(item)
#return item
def close_spider(self, spider):
print(self.players)
with open("myjsonfile.json", "wt") as fd:
json.dump(combine(self.players), fd)
Output json file:
[{"name": "#9 Erling Haaland", "season_stats":
[{"2021": {"Appearances": "30", "Goals": "29", "Assists": "8"}},
{"2020": {"Appearances": "41", "Goals": "41", "Assists": "12"}},
{"2022": {"Appearances": "10", "Goals": "14", "Assists": "1"}}]},
{"name": "#17 Kevin De Bruyne", "season_stats":
[{"2020": {"Appearances": "40", "Goals": "10", "Assists": "18"}},
{"2021": {"Appearances": "45", "Goals": "19", "Assists": "14"}},
{"2022": {"Appearances": "10", "Goals": "1", "Assists": "8"}}]}]
Answered By - Baraa Zaid
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.