justkidding036のブログ

厚労省のページからコロナ感染者と死亡者数を
週毎に集計してxlsxに書き込んだ


import re
import unicodedata
from datetime import datetime, timedelta
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

# Reiwa-era date -> Western-calendar date
def reiw_to_dt(d):
    """Convert a Reiwa-era date such as '令和3年4月30日' to 'yy/mm/dd/Day'.

    Args:
        d: Date text beginning with the two-character era name, followed by
            '<era year>年<month>月<day>日'. Full-width digits are accepted
            because the text is NFKC-normalized first. The era year may
            have any number of digits, or be '元' (first year of the era).

    Returns:
        The date formatted with '%y/%m/%d/%a', e.g. '21/04/30/Fri'. The
        weekday abbreviation is produced by strftime.

    Raises:
        ValueError: If the text does not have the expected form.
    """
    # NFKC folds full-width digits to ASCII; [2:] drops the era name.
    d_han = unicodedata.normalize('NFKC', d)[2:]
    # Split at the first '年' instead of taking a single character, so that
    # era years with two or more digits (Reiwa 10 = 2028 onward) are read
    # in full.
    era_year, sep, rest = d_han.partition('年')
    # Reiwa N is Western year N + 2018; '元' denotes year 1.
    year = (1 if era_year == '元' else int(era_year)) + 2018
    d_w = str(year) + sep + rest
    return datetime.strptime(d_w, '%Y年%m月%d日').strftime('%y/%m/%d/%a')
# Date string -> date string of the previous day
def str_day_sub1(d):
    """Return the day before *d*, in the same 'yy/mm/dd/Day' text format.

    Args:
        d: A date formatted with '%y/%m/%d/%a', e.g. '21/05/03/Mon'.

    Returns:
        The preceding calendar day formatted with '%y/%m/%d/%a',
        e.g. '21/05/02/Sun'.

    Raises:
        ValueError: If *d* does not match the expected format.
    """
    # A fixed one-day offset needs nothing beyond the standard-library
    # timedelta, which this file already imports.
    previous_day = datetime.strptime(d, '%y/%m/%d/%a') - timedelta(days=1)
    return previous_day.strftime('%y/%m/%d/%a')

# 'YYYYMM' keys for 18 consecutive months, oldest first and ending with the
# current month (months_back runs from 17 down to 0).
yyyymms = [
    (datetime.now() - relativedelta(months=months_back)).strftime('%Y%m')
    for months_back in range(17, -1, -1)
]

# One monthly press-release index page per key.
urls = []
for ym in yyyymms:
    urls.append('https://www.mhlw.go.jp/stf/houdou/houdou_list_' + ym + '.html')

# Collect [href, 'yy/mm/dd/Day'] pairs for every daily COVID-19 status report
# linked from the monthly index pages.
# Example link text:
#   新型コロナウイルス感染症の現在の状況と厚生労働省の対応について（令和3年4月30日版）
# The pattern captures the text inside the first pair of full-width
# parentheses; it is a raw string so no invalid-escape warning is emitted.
date_in_parens = re.compile(r"(?<=（).+?(?=）)")

hrefss = []
for u in urls:
    # A timeout keeps one unresponsive page from hanging the whole run.
    soup = BeautifulSoup(requests.get(u, timeout=30).content, 'html.parser')
    # Links are walked in reverse page order
    # (presumably the pages list the newest report first — TODO confirm).
    for a in reversed(soup.find_all('a')):
        if '新型コロナウイルス感染症の現在' not in a.text:
            continue
        found = date_in_parens.findall(a.text)
        if not found:
            # A matching title without a parenthesized date cannot be dated;
            # skip it rather than fail with IndexError.
            continue
        # [:-1] drops the last character ('版' in the example above).
        hrefss.append([a.get('href'), reiw_to_dt(found[0][:-1])])

if not hrefss:
    raise RuntimeError('No COVID-19 status report links were found on the index pages.')

# Keep one report per week: those whose date string ends in 'Mon'.
hrefs_mon = [h for h in hrefss if h[1][-3:] == 'Mon']
end = hrefss[-1]  # most recent report
# Add the most recent report so the last, partial week is covered. When that
# report is itself a Monday it is already the final entry; appending it again
# would later yield a zero-count row whose end date precedes its start date.
if not hrefs_mon or hrefs_mon[-1] != end:
    hrefs_mon.append(end)

def _count_after(label, text):
    """Return the first integer found between *label* and the next '名'.

    Commas are removed before the digits are read. Only the first run of
    digits is used because the figure is sometimes followed by extra
    characters.

    Raises:
        ValueError: If the label or a number after it cannot be found.
    """
    segment = re.search(label + r'(.+?)名', text)
    digits = re.search(r'\d+', segment.group(1).replace(',', '')) if segment else None
    if digits is None:
        raise ValueError('no count found after ' + label)
    return int(digits.group())


# For each selected report page, read the case and death counts from the
# first paragraph containing '確認されてい'.
# Rows are [date string, cases, deaths].
day_kanns_sibou = []
for h in tqdm(hrefs_mon):
    # urljoin handles site-relative ('/stf/...'), path-relative and already
    # absolute hrefs; plain concatenation breaks on absolute ones.
    url = urljoin('https://www.mhlw.go.jp/', h[0])
    soup = BeautifulSoup(requests.get(url, timeout=30).content, 'html.parser')
    summaries = [p.text for p in soup.find_all('p') if '確認されてい' in p.text]
    if not summaries:
        raise ValueError('summary paragraph not found: ' + url)
    p_text = summaries[0]
    # Both figures go through the same digit extraction, so stray text after
    # the death count no longer makes int() fail.
    kanns = _count_after('感染者は', p_text)
    sibou = _count_after('死亡者は', p_text)
    day_kanns_sibou.append([h[1], kanns, sibou])

# Pair each sample with the one that follows it and take the differences of
# the two counts. Each row is:
#   [period start, day before the next sample, cases delta, deaths delta]
following = day_kanns_sibou.copy()
del following[0]

st_en_kanns_sibou = []
for current, upcoming in zip(day_kanns_sibou, following):
    start_day, start_cases, start_deaths = current
    next_day, next_cases, next_deaths = upcoming
    st_en_kanns_sibou.append([
        start_day,
        str_day_sub1(next_day),
        next_cases - start_cases,
        next_deaths - start_deaths,
    ])

# Write the rows to the workbook without a header row or index column.
df = pd.DataFrame(st_en_kanns_sibou)
xlsx_path = r"C:\Users\dazai\OneDrive\デスクトップ\corona.xlsx"
df.to_excel(xlsx_path, header=False, index=None)

import openpyxl
# Re-open the workbook written above so the column widths can be adjusted.
book = openpyxl.load_workbook(xlsx_path)
sheet = book['Sheet1']

# Widen columns A and B (the start and end date strings). Looping over the
# column letters applies the same width to several columns.
for column_letter in ('A', 'B'):
    sheet.column_dimensions[column_letter].width = 15

# Save back to the same file.
book.save(xlsx_path)

justkidding036のブログ

コロナ感染者と死亡者数を週毎に集計してxlsxに書き込んだ

pythonでJRAの「過去レース結果検索」のアドレスを作る