python - Beautiful soup : extract weather info : table --> Excel file -
i'd extract data table of website
weather history table
what need
- extract contents of
<td>
in<table class="responsive">
\<tbody>
\<tr>
- generate excel file python whole data.
what don't need
- all units next numbers (ie : 22°c) (i don't want
<span>
in<td>
)
<td class="data-cell alt-cell">33.8 <span class="table-unit">°c</span></td>
- exclude repetiting headers in
<tbody>
\<tr class="column-heading">
,<tr class="row-subheading">
can show , explain how extract these data in excel file?
html code
<table id="history_table" class="responsive"> <thead> <tr class="column-heading"> <th class="year-cell">2016</th> <th colspan="3">temperature</th> <th colspan="3">dew point</th> <th colspan="3">humidity</th> <th colspan="3">speed</th> <th colspan="3">pressure</th> <th>precip. accum.</th> </tr> <tr class="row-subheading"><th>sep</th> <th class="alt-cell">high</th> <th class="alt-cell">avg</th> <th class="alt-cell">low</th> <th>high</th> <th>avg</th> <th>low</th> <th class="alt-cell">high</th> <th class="alt-cell">avg</th> <th class="alt-cell">low</th> <th>high</th> <th>avg</th> <th>gust</th> <th class="alt-cell">high</th> <th class="alt-cell">avg</th> <th class="alt-cell">low</th> <th>sum</th> </tr> </thead> <tbody> <tr> <td class="data-cell">12</td> <td class="data-cell alt-cell">33.8 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">26.1 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">18.4 <span class="table-unit">°c</span></td> <td class="data-cell">17.6 <span class="table-unit">°c</span></td> <td class="data-cell">16 <span class="table-unit">°c</span></td> <td class="data-cell">13.4 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">88 <span class="table-unit">%</span></td> <td class="data-cell alt-cell">55 <span class="table-unit">%</span></td> <td class="data-cell alt-cell">30 <span class="table-unit">%</span></td> <td class="data-cell">12 <span class="table-unit">kph</span></td> <td class="data-cell">1 <span class="table-unit">kph</span></td> <td class="data-cell">16 <span class="table-unit">kph</span></td> <td class="data-cell alt-cell">1016 <span class="table-unit">hpa</span></td> <td class="data-cell alt-cell">1014 <span class="table-unit">hpa</span></td> <td class="data-cell alt-cell">1012 <span class="table-unit">hpa</span></td> <td class="data-cell">0 <span class="table-unit">mm</span></td> </tr> <tr> <td class="data-cell">13</td> <td class="data-cell alt-cell">34.2 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">29 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">23.8 <span class="table-unit">°c</span></td> <td class="data-cell">17.4 <span class="table-unit">°c</span></td> <td class="data-cell">15.6 <span class="table-unit">°c</span></td> <td class="data-cell">12.7 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">61 <span class="table-unit">%</span></td> <td class="data-cell alt-cell">49 <span class="table-unit">%</span></td> <td class="data-cell alt-cell">29 <span class="table-unit">%</span></td> <td class="data-cell">12 <span class="table-unit">kph</span></td> <td class="data-cell">3 <span class="table-unit">kph</span></td> <td class="data-cell">16 <span class="table-unit">kph</span></td> <td class="data-cell alt-cell">1013 <span class="table-unit">hpa</span></td> <td class="data-cell alt-cell">1010 <span class="table-unit">hpa</span></td> <td class="data-cell alt-cell">1008 <span class="table-unit">hpa</span></td> <td class="data-cell">0 <span class="table-unit">mm</span></td> </tr> <tr class="column-heading"> <td class="year-cell">2017</td> <td colspan="3">temperature</td> <td colspan="3">dew point</td> <td colspan="3">humidity</td> <td colspan="3">speed</td> <td colspan="3">pressure</td> <td>precip. accum.</td> </tr> <tr class="row-subheading"> <td>apr</td> <td class="alt-cell">high</td> <td class="alt-cell">avg</td> <td class="alt-cell">low</td> <td>high</td> <td>avg</td> <td>low</td> <td class="alt-cell">high</td> <td class="alt-cell">avg</td> <td class="alt-cell">low</td> <td>high</td> <td>avg</td> <td>gust</td> <td class="alt-cell">high</td> <td class="alt-cell">avg</td> <td class="alt-cell">low</td> <td>sum</td> </tr> <tr> <td class="data-cell">1</td> <td class="data-cell alt-cell">17.4 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">14.1 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">10.7 <span class="table-unit">°c</span></td> <td class="data-cell">10.2 <span class="table-unit">°c</span></td> <td class="data-cell">7.4 <span class="table-unit">°c</span></td> <td class="data-cell">4.7 <span class="table-unit">°c</span></td> <td class="data-cell alt-cell">82 <span class="table-unit">%</span></td> <td class="data-cell alt-cell">68 <span class="table-unit">%</span></td> <td class="data-cell alt-cell">45 <span class="table-unit">%</span></td> <td class="data-cell">11 <span class="table-unit">kph</span></td> <td class="data-cell">5 <span class="table-unit">kph</span></td> <td class="data-cell">18 <span class="table-unit">kph</span></td> <td class="data-cell alt-cell">1016 <span class="table-unit">hpa</span></td> <td class="data-cell alt-cell">1015 <span class="table-unit">hpa</span></td> <td class="data-cell alt-cell">1013 <span class="table-unit">hpa</span></td> <td class="data-cell">0 <span class="table-unit">mm</span></td> </tr>...
python code
from xlsxwriter import workbook bs4 import beautifulsoup def read_file(): file = open('meteo.html', 'rt', encoding='utf8') data = file.read() file.close() return data data_path ='/users/xtro/dropbox/work/test/data/out/meteo' def write_data_to_excel_file(datas,data_path): #print(datas[8]) workbook=workbook(data_path +'/meteo.xlsx') worksheet = workbook.add_worksheet() row=0 worksheet.write(row,0,'date') worksheet.write(row,1,'température haute en °c') worksheet.write(row,2,'température moyenne en °c') worksheet.write(row,3,'température basse en °c') worksheet.write(row,4,'point de rosée haut en °c') worksheet.write(row,5,'point de rosée moyenne en °c') worksheet.write(row,6,'point de rosée bas') worksheet.write(row,7,'humidité haute en %') worksheet.write(row,8,'humidité moyenne en %') worksheet.write(row,9,'humidité basse en %') worksheet.write(row,10,'vitesse haute en km/h') worksheet.write(row,11,'raffale en km/h') worksheet.write(row,12,'pression haute en hpa') worksheet.write(row,13,'pression moyenne en hpa') worksheet.write(row,14,'pression basse en hpa') worksheet.write(row,15,'précipitation/jour en mm') row+=1 data in datas: print(data[1]) cellule0=data[1] worksheet.write(row,0,cellule0) cellule0=data[0] cellule1=data[1] cellule2=data[2] cellule3=data[3] cellule4=data[4] cellule5=data[5] cellule6=data[6] cellule7=data[7] cellule8=data[8] cellule9=data[9] cellule10=data[10] cellule11=data[11] cellule12=data[12] cellule13=data[13] cellule14=data[14] cellule15=data[15] #cellule[i]=data[i] worksheet.write(row,0,cellule0) worksheet.write(row,1,cellule1) worksheet.write(row,2,cellule2) worksheet.write(row,3,cellule3) worksheet.write(row,4,cellule4) worksheet.write(row,5,cellule5) worksheet.write(row,6,cellule6) worksheet.write(row,7,cellule7) worksheet.write(row,8,cellule8) worksheet.write(row,9,cellule9) worksheet.write(row,10,cellule10) worksheet.write(row,11,cellule11) worksheet.write(row,12,cellule12) worksheet.write(row,13,cellule13) worksheet.write(row,14,cellule14) worksheet.write(row,14,cellule15) row +=1 workbook.close() soup = beautifulsoup(read_file(),'lxml') data = [] table = soup.find('table',class_='responsive') table_body = table.find('tbody') rows = table_body.find_all('tr') tr in rows: spans = tr.find_all('span') #print(spans) if spans: continue #print (rows) row in rows: cols = row.find_all('td') #print (cols) cols = [ele.text.strip() ele in cols] data.append([ele ele in cols if ele]) write_data_to_excel_file(data,data_path)
here's alternative approach using selenium. forgot requirement eliminate headers.
>>> selenium import webdriver >>> driver = webdriver.chrome() >>> driver.get('https://www.wunderground.com/personal-weather-station/dashboard?id=iledefra210#history/tdata/s20160912/e20170912/mcustom') >>> rows = driver.find_elements_by_xpath('.//td[@class="data-cell"]/..') >>> len(rows) 366 >>> rows[-1].text '12 19.9 °c 16.3 °c 12.9 °c 14.3 °c 9.6 °c 6.8 °c 82 % 69 % 47 % 18 kph 3 kph 24 kph 1012 hpa 1009 hpa 1005 hpa 2 mm' >>> rows[0].text '12 33.8 °c 26.1 °c 18.4 °c 17.6 °c 16 °c 13.4 °c 88 % 55 % 30 % 12 kph 1 kph 16 kph 1016 hpa 1014 hpa 1012 hpa 0 mm' >>> r, row in enumerate(rows): ... [_.text.split()[0] _ in row.find_elements_by_xpath('.//td') ] ... ['12', '33.8', '26.1', '18.4', '17.6', '16', '13.4', '88', '55', '30', '12', '1', '16', '1016', '1014', '1012', '0'] ['13', '34.2', '29', '23.8', '17.4', '15.6', '12.7', '61', '49', '29', '12', '3', '16', '1013', '1010', '1008', '0'] ['14', '33.9', '26.6', '19.3', '18.4', '14.7', '12.4', '77', '52', '32', '16', '2', '20', '1013', '1010', '1007', '0.3'] ['15', '22.1', '19.5', '16.9', '18.3', '16.2', '13.1', '98', '87', '74', '13', '2', '16', '1014', '1011', '1009', '16.8'] ['16', '21.4', '18.3', '15.3', '16.5', '13.9', '12.4', '96', '80', '58', '13', '2', '20', '1015', '1013', '1012', '6.9'] ['17', '19.6', '16.4', '13.3', '14.1', '12.9', '10.9', '94', '84', '62', '5', '0', '8', '1019', '1017', '1015', '1'] ['18', '25.5', '18.9', '12.4', '17.7', '14.4', '11.8', '97', '84', '55', '4', '0', '8', '1020', '1018', '1017', '0.5'] ['19', '22.4', '18.8', '15.1', '17.6', '14.8', '13.7', '98', '84', '64', '5', '0', '8', '1021', '1020', '1019', '1.8'] ['20', '24.5', '20.1', '15.6', '15.1', '13.1', '10', '90', '71', '48', '2', '0', '4', '1020', '1018', '1017', '0'] ['21', '30', '20.5', '11', '14.2', '11.5', '9.7', '97', '63', '32', '2', '0', '4', '1020', '1018', '1017', '0'] ['22', '26.4', '19.7', '13', '13.3', '11.7', '9.6', '96', '63', '38', '11', '0', '16', '1021', '1020', '1019', '0'] ['23', '31', '21', '14.7', '15.4', '12.8', '10.9', '91', '62', '34', '8', '0', '12', '1026', '1023', '1021', '0'] ['24', '26', '20.1', '14.2', '13', '10.7', '8.3', '82', '57', '35', '21', '2', '24', '1024', '1020', '1016', '0'] ['25', '24.7', '17.9', '14.2', '15.9', '11.3', '8.9', '88', '66', '41', '12', '1', '16', '1023', '1019', '1014', '1'] ['26', '24.2', '16.4', '11.6', '11.5', '9.6', '6.4', '94', '66', '36', '12', '1', '20', '1024', '1023', '1022', '0'] ['27', '26.2', '19', '12.4', '11.2', '9.4', '7.7', '86', '56', '34', '10', '1', '16', '1028', '1025', '1023', '0'] ['28', '27.4', '20.6', '16.6', '17.1', '13.8', '9.3', '84', '66', '44', '14', '1', '20', '1028', '1027', '1025', '0'] ['29', '23.1', '18.4', '15.1', '15.5', '14.2', '12.5', '94', '77', '56', '17', '2', '24', '1025', '1021', '1016', '0'] ['30', '21.3', '16.6', '14.5', '13.4', '11.6', '7.8', '93', '73', '50', '10', '0', '12', '1017', '1014', '1012', '1.8'] ['1', '19.8', '15.6', '12.9', '13.7', '11.6', '9.4', '97', '78', '57', '18', '2', '24', '1012', '1010', '1009', '1'] ['2', '22.7', '14.6', '10.5', '11', '8.4', '6.5', '85', '67', '41', '14', '1', '16', '1022', '1017', '1011', '1'] ['3', '27.2', '16.1', '8.3', '11.1', '7.9', '5', '99', '64', '26', '12', '0', '16', '1028', '1025', '1022', '0'] ['4', '24.1', '15.9', '9.8', '12', '8.9', '6', '98', '67', '34', '14', '1', '20', '1028', '1027', '1026', '0'] ['5', '21.9', '14.6', '9.1', '9.4', '6.4', '2.5', '87', '61', '33', '19', '1', '28', '1026', '1025', '1023', '0'] ['6', '20.4', '12.3', '5.8', '6.6', '4.4', '2.4', '86', '61', '36', '14', '1', '20', '1024', '1020', '1017', '0'] ['7', '17.6', '12.5', '9.5', '10', '7.8', '6.1', '88', '74', '56', '10', '0', '16', '1019', '1018', '1017', '0']
Comments
Post a Comment