Issue
This one is an odd one. I ran this code in the morning and it worked just fine on the HTML from the page. Now when I run it, the tables variable comes back with 0 items, so the for loop never runs and no data is collected and no dataframe is created.
from bs4 import BeautifulSoup
import pandas as pd

def parseForclosure(pagesource):
    data = []
    soup = BeautifulSoup(pagesource, 'html.parser')
    tables = soup.find_all('table', attrs={'class': 'ad_tab'})
    print(len(tables))
    df2 = pd.DataFrame()
    for i in range(len(tables)):
        print(i)
        table_body = tables[i].find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele])
        data2 = {'AuctionType': [data[0]],
                 'CaseNo': [data[1]],
                 'FinalJudgmentAmount': [data[2]],
                 'ParcelID': [data[3]],
                 'PropertyAddress1': [data[4]],
                 'PropertyAddress2': [data[5]],
                 'AssessedValue': [data[6]],
                 'PlaintiffMaxBid': [data[7]]}
        df = pd.DataFrame(data2, columns=['AuctionType', 'CaseNo', 'FinalJudgmentAmount',
                                          'ParcelID', 'PropertyAddress1', 'PropertyAddress2',
                                          'AssessedValue', 'PlaintiffMaxBid'])
        df2 = df2.append(df)
        print(df)
    return df2
Here is the call:
df = parseForclosure(source)
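A quick way to tell whether the problem is in the parsing or in the fetched page itself is to check the raw source for the table class before parsing. This is a minimal, hypothetical check (not part of my original code), assuming source is the raw HTML string passed to parseForclosure:

# Hypothetical sanity check on the fetched page source.
# If 'ad_tab' never appears, the problem is the page that was fetched, not the parsing.
print(len(source))           # a suspiciously short page often means a redirect or block page
print('ad_tab' in source)    # the tables' class should appear somewhere in the raw HTML
print(source[:500])          # eyeball the start of the document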
Here is a snippet of what the HTML looks like:
<table class="ad_tab" tabindex="0"><tbody><tr><th class="AD_LBL" scope="row">Auction Type:</th><td class="AD_DTA">FORECLOSURE</td></tr><tr><th aria-label="Case Number" class="AD_LBL" scope="row">Case #:</th><td class="AD_DTA"><a href="/index.cfm?zaction=auction&zmethod=details&AID=103757&bypassPage=1">07009032CA01</a></td></tr><tr><th class="AD_LBL" scope="row">Final Judgment Amount:</th><td class="AD_DTA">$323,248.61</td></tr><tr><th class="AD_LBL" scope="row">Parcel ID:</th><td class="AD_DTA">30-6901-001-2470</td></tr><tr><th class="AD_LBL" scope="row">Property Address:</th><td class="AD_DTA">12260 SW 191 ST</td></tr><tr><th class="AD_LBL" scope="row"></th><td class="AD_DTA">MIAMI, FL- 33177</td></tr> <tr><th class="AD_LBL" scope="row">Assessed Value:</th><td class="AD_DTA">$184,791.00</td></tr><tr><th class="AD_LBL" scope="row">Plaintiff Max Bid:</th><td class="AD_DTA ASTAT_MSGPB">Hidden</td></tr></tbody></table>
You can see a sample of all the tables at the link below.
https://projectcodesamples.s3.amazonaws.com/AuctionSample.html
My objective is to place the data points into a dataframe.
Sample file with missing data points:
This is a sample file with all data points:
Sample_file_with_no_missing_data_points
Ideally, I should be able to extract from both without the dataframe size changing.
Solution
Let's say that you have three HTML files with the data you provided since you first posted your question:
- Source.html
- Source2.html
- Source3.html
I have used this updated code to combine all of the data into one dataframe:
import io
import csv

from bs4 import BeautifulSoup
import pandas as pd

input_files_names = [
    'Source.html',
    'Source2.html',
    'Source3.html'
]

def setup_dataframes(files_names):
    for current_file_name in files_names:
        with open(current_file_name) as source_file:
            soup = BeautifulSoup(source_file, 'html.parser')

        # Map each output column to the label text of the table row that holds it.
        field_labels = {
            'AuctionType': 'Auction Type:',
            'CaseNo': 'Case #:',
            'JudgementAmount': 'Final Judgment Amount:',
            'ParcelID': 'Parcel ID:',
            'AssessedValue': 'Assessed Value:',
            'PlaintiffMaxBid': 'Plaintiff Max Bid:'
        }

        column_names = (
            'AuctionType',
            'CaseNo',
            'JudgementAmount',
            'ParcelID',
            'PropertyAddress1',
            'PropertyAddress2',
            'AssessedValue',
            'PlaintiffMaxBid'
        )

        def extract_data(soup):
            # Yield one tuple of column values per auction table; missing fields become ''.
            for current_table in soup.find_all('table', class_='ad_tab'):
                current_auction = {}
                for (current_field, current_label) in field_labels.items():
                    current_field_cell = current_table.tbody.find('th', string=current_label)
                    if current_field_cell is not None:
                        current_data_cell = current_field_cell.next_sibling
                        current_auction[current_field] = current_data_cell.get_text()
                # The address spans two rows: the labelled row and the unlabelled row after it.
                address_row = current_table.tbody.find('th', string='Property Address:')
                if address_row is not None:
                    current_auction['PropertyAddress1'] = address_row.find_next_sibling('td').get_text()
                    address2_row = address_row.parent.next_sibling.td
                    if address2_row is not None:
                        current_auction['PropertyAddress2'] = address2_row.get_text()
                yield tuple(current_auction.get(current_field, '') for current_field in column_names)

        # Write the extracted rows to an in-memory CSV file, then read it back as a dataframe.
        with io.StringIO() as intermediate_data:
            intermediate_csv = csv.writer(intermediate_data)
            intermediate_csv.writerows(extract_data(soup))
            intermediate_data.seek(0, 0)
            df = pd.read_csv(intermediate_data, header=None, names=column_names)
            yield df

df_composite = pd.concat(setup_dataframes(input_files_names), ignore_index=True)
print(df_composite)
What has been done here is:
- Extracting the text from the source HTML file by finding each field before creating an output row
- Creating a temporary, in-memory CSV file using io.StringIO and the csv module (a standalone sketch of this round-trip appears below)
- Creating a Pandas dataframe from that CSV file using pd.read_csv()
If you are processing a lot of data, you may consider writing to a real file instead of using an in-memory one.
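To illustrate just the in-memory CSV round-trip on its own, here is a minimal, self-contained sketch. The column names and row values below are made up purely for illustration; in the real code they come from the parsed HTML:

import io
import csv

import pandas as pd

# Purely illustrative columns and rows (placeholders, not auction data).
column_names = ('ColA', 'ColB')
rows = [('value 1', 'value 2'), ('value 3', '')]

with io.StringIO() as intermediate_data:
    # csv.writer handles quoting and escaping, so the text round-trips safely.
    csv.writer(intermediate_data).writerows(rows)
    intermediate_data.seek(0, 0)  # rewind so read_csv starts at the beginning
    df = pd.read_csv(intermediate_data, header=None, names=column_names)

print(df)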
Answered By - EvensF