Issue
I am trying to extract HTML information from a group of URLs that have been archived on a particular website, and I need to loop the extraction over all of the URLs in a list. Here is my scrape for a single day:
#Finding the specific p class that contains daily information
dailydata = soup.find('p', attrs={"class": "style22"}).get_text()
#Getting rid of the \n's and \t's in the string
to_replace = ['\n\t\t\t\t\t\t', '\n\t\t\t\t\t\t ', '\xa0', ' ', ' ']
for rep1 in to_replace:
    dailydata = dailydata.replace(rep1, '_')
rep2 = ['__', '_']
for r in rep2:
    dailydata = dailydata.replace(r, '_')
#Splitting the string into a list
splitdd = dailydata.split('_')
#Finding the specific division class that contains the report information
reportinfo = soup.find('div', attrs={"class": "style18"}).get_text()
#Getting rid of \n
reportinfo = reportinfo.replace('\n', ' ')
reportinfo = reportinfo.split(' ')
#Finding the specific p class that contains title information for daily data
titles = soup.find('p', attrs={"class": "style21"}).get_text()
titles = titles.replace('\n\t\t\t\t\t\t', ' ')
#Constructing necessary data tables
data1 = {'Date': [reportinfo[3]],
         'Water Temp (Little River)': [splitdd[0] + ' degrees Fahrenheit'],
         'Stream Flow Rate': [splitdd[2] + ' Feet ' + splitdd[4] + ' CF/s'],
         'Sunrise': [splitdd[6]],
         'Sunset': [splitdd[7]],
         'Rainfall 2023 YTD Knoxville Apt': [splitdd[8]],
         'Rainfall Normal YTD Knoxville Apt': [splitdd[9]]
         }
data_column_index = ['Date',
                     'Water Temp (Little River)',
                     'Stream Flow Rate',
                     'Sunrise',
                     'Sunset',
                     'Rainfall 2023 YTD Knoxville Apt',
                     'Rainfall Normal YTD Knoxville Apt'
                     ]
dates = [reportinfo[3]]
reoccuring_data = [[reportinfo[3]],
                   [splitdd[0] + ' degrees Fahrenheit'],
                   [splitdd[2] + ' Feet ' + splitdd[4] + ' CF/s'],
                   [splitdd[6]],
                   [splitdd[7]],
                   [splitdd[8]],
                   [splitdd[9]]
                   ]
#Calling data to put into dataframe
df1 = pd.DataFrame(data=data1, index=dates, columns=data_column_index)
reoccurdf = pd.DataFrame(data=reoccuring_data)
df1 = pd.concat([df1, df1], ignore_index=True)
#Saving to csv file
df1.to_csv(save_path + 'output.csv', mode='a', index=False, header=False)
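As an aside, the chained replace calls above could probably be collapsed into a single regex split; a minimal sketch, assuming dailydata holds the same raw string extracted from the style22 paragraph:

import re

#Sketch: split the raw paragraph text on any run of whitespace. In Python 3,
#\s also matches the non-breaking space \xa0, so no replace chain is needed.
#Assumes dailydata is the raw string extracted above.
splitdd = re.split(r'\s+', dailydata.strip())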
I can't figure out how to loop the single-day HTML scrape (the code above) so that it extracts information from every URL in a list. Below is my archive scraping program; I have the URLs I need to scrape in a list:
import os
from bs4 import BeautifulSoup as bs
from datetime import date, timedelta
import requests
import pprint
import re
import pyperclip
#We need from 2017.11.1 to 2023.10.26
#The format for this particular html is 'mmddyy'
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)
start_date = date(2017,11,1)
end_date = date(2023,10,30)
archive_date_range_list = []
for single_date in daterange(start_date, end_date):
    x = single_date.strftime("%m%d%y")
    archive_date_range_list.append(x)
new_list_dates = ','.join([x for x in archive_date_range_list])
new_list_dates = new_list_dates.split(",")
#Little River Outfitters website URL with the archive date variable
#we need the format URL1+"date"+URL2
URL1 = [r'https://littleriveroutfitters.com/WEBSITE-2008/pages/fishing/']
URL2 = [r'.htm']
#loop to configure the particular website
website_dates_list = []
for i in new_list_dates:
    x1 = URL1, i, URL2
    website_dates_list.append(x1)
#making the list of url that need to be scraped
pages = new_list_dates
baseurl = 'https://littleriveroutfitters.com/WEBSITE-2008/pages/fishing/'
my_url_list= [''.join(map(str, [baseurl, p, '.htm'])) for p in pages]
#Now I need to get data from the URLs in my_url_list
I have tried to run a BeautifulSoup loop with no success.
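In outline, what I am after is something like the sketch below, where scrape_day is a hypothetical stand-in for the single-day parsing code above wrapped into a function, and my_url_list and save_path come from my code above:

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

def scrape_day(soup):
    #Hypothetical wrapper: run the single-day parsing code above on one
    #page's soup and return the daily values as a dict (one row)
    ...

rows = []
for url in my_url_list:
    response = requests.get(url)
    if response.ok:
        rows.append(scrape_day(bs(response.text, 'html.parser')))

pd.DataFrame(rows).to_csv(save_path + 'output.csv', index=False)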
Solution
Just to give you an idea of how you could proceed: use a while loop and add a day on each iteration; once you reach end_date, simply break the loop.
Example
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date, timedelta
import requests
start_date = date(2017,11,1)
end_date = date(2017,11,20)
data = []
while True:
    url = f'https://littleriveroutfitters.com/WEBSITE-2008/pages/fishing/{start_date.strftime("%m%d%y")}.htm'
    try:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        d = soup.select('table table tr:has(img[alt*="Fishing Gauge"]) td:has(p :not(img))')
        d = dict(zip(d[0].stripped_strings, d[1].stripped_strings))
        d.update({'date': start_date})
        data.append(d)
    except Exception:
        print(start_date, url)
    if start_date == end_date:
        break
    else:
        start_date = start_date + timedelta(days=1)
pd.DataFrame(data)
Output
|    | Water Temperature Little River | Stream Flow | Sunrise | Sunset | Rainfall 2017 YTD Knoxville Apt | Rainfall Normal YTD Knoxville Apt | date |
|----|--------------------------------|-------------|---------|--------|---------------------------------|-----------------------------------|------|
| 0  | 47.7 Fahrenheit | 2.16 Feet 239 CFS | 7:58 | 6:40 | 45.31" | 39.95" | 2017-11-01 |
| 1  | 51.8 Fahrenheit | 2.11 Feet 222 CFS | 7:59 | 6:39 | 45.31" | 39.46" | 2017-11-02 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 18 | 51.1 Fahrenheit | 2.17 Feet 242 CFS | 7:16 | 5:26 | 48.47" | 41.56" | 2017-11-19 |
| 19 | 44.2 Fahrenheit | 1.90 Feet 159 CFS | 7:17 | 5:26 | 48.47" | 41.70" | 2017-11-20 |
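If you would rather iterate over your prebuilt list of URLs instead of stepping through dates, the same parsing works in a plain for loop; a minimal sketch, assuming my_url_list from the question and the selector used above:

data = []
for url in my_url_list:
    try:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        d = soup.select('table table tr:has(img[alt*="Fishing Gauge"]) td:has(p :not(img))')
        row = dict(zip(d[0].stripped_strings, d[1].stripped_strings))
        row['url'] = url  #keep track of which page each row came from
        data.append(row)
    except Exception:
        print('skipped', url)

pd.DataFrame(data)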
Answered By - HedgeHog