发布于2023-06-20 21:39 阅读(3674) 评论(0) 点赞(29) 收藏(2)
这篇文章记录以下两点:
1.获取天气信息的方法和步骤以及遇到的问题和改进方法
2.获取到天气信息后进行数据清洗和可视化展示
总的来说,就是将网站中的天气信息通过爬虫技术保存在文件中,再通过对文件中的文本数据进行处理后用图表方式展现出来。
# coding:utf-8
import requests


def get_data(url):
    """Request *url* and print whether the request succeeded.

    Sends a GET request with a desktop-browser User-Agent header. Prints
    '请求成功' when the response status code is 200 and '请求失败' for any
    other status code. Nothing is returned.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }
    response = requests.get(url=url, headers=browser_headers)
    outcome = '请求成功' if response.status_code == 200 else '请求失败'
    print(outcome)


URL = 'http://www.weather.com.cn/weather7d/101270101.shtml'
get_data(URL)
我们发现天气信息保存在网页源代码里,因此只需获取网页源代码并进行解析,即可得到所需数据。
# coding:utf-8
import requests


def get_data(url, timeout=10):
    """Download *url* and return the page source as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests.get can block indefinitely on an
            unresponsive host.

    Returns:
        The response body decoded as UTF-8 when the status code is 200,
        otherwise the string '请求失败'.

    Raises:
        requests.exceptions.RequestException: propagated from requests on
            connection errors or when the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }
    r = requests.get(url=url, headers=headers, timeout=timeout)
    if r.status_code != 200:
        return '请求失败'
    # Force the decoding used by the .text property below.
    r.encoding = 'UTF-8'
    return r.text


URL = 'http://www.weather.com.cn/weather7d/101270101.shtml'
print(get_data(URL))
URL = 'http://www.weather.com.cn/weather/101270101.shtml'


def _parse_day(li, join_all_winds=False):
    """Extract [date, weather, high, low, wind] from one forecast <li> tag.

    Args:
        li: The <li> tag holding one day's forecast.
        join_all_winds: When True, every <span> title inside the "win"
            paragraph is concatenated (each followed by '且'); when False
            only the first <span> title is used.

    Returns:
        A five-item list: date text, weather text, highest temperature
        (or 'NONE' when the page has no <span> for it), lowest
        temperature, and wind direction(s) joined with wind force.
    """
    date = li.find('h1').text
    wea = li.find('p', class_="wea").text

    tem = li.find('p', class_="tem")
    # The page omits today's highest temperature when visited at night, so
    # fall back to 'NONE' only when the <span> is really missing.
    high_span = tem.find("span")
    tem_h = high_span.text if high_span is not None else 'NONE'
    tem_l = tem.find("i").text

    win = li.find('p', attrs={"class": "win"})
    if join_all_winds:
        win1 = ''.join(s.get('title') + '且' for s in win.find_all("span"))
    else:
        win1 = win.find("span").get('title') + '且'
    win2 = win.find("i").text  # wind force

    return [date, wea, tem_h, tem_l, win1 + win2]


# Fetch the page source and locate the forecast list: div#7d > ul > li.
html_code = get_data(URL)
soup = BeautifulSoup(html_code, "html.parser")
# Equivalent lookup: soup.find('div', attrs={'id': '7d', 'class': 'c7d'})
div = soup.find("div", id="7d")
ul = div.find("ul")
lis = ul.find_all("li")
# The time the site last updated its data is available via:
# soup.find("div", id='around').find("h1").find("i").text

# Today's entry joins every wind direction; the remaining days use the first.
weather_today = _parse_day(lis[0], join_all_winds=True)
weather_all = [weather_today] + [_parse_day(li) for li in lis[1:]]
print(weather_all)
# coding:utf-8
import requests


def get_data(web_url):
    """Fetch *web_url* and return its body decoded as JSON.

    Returns the JSON-decoded response content when the status code is 200,
    otherwise the string '请求失败!'.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }
    response = requests.get(url=web_url, headers=browser_headers)
    if response.status_code != 200:
        return '请求失败!'
    # Decode the response body as JSON.
    return response.json()


url = 'http://d1.weather.com.cn/calendar_new/2022/101270101_202207.html'
data = get_data(url)
print(data)
# coding:utf-8
import requests
import random


def get_data(web_url):
    """Fetch *web_url* with a randomly chosen User-Agent and decode it as JSON.

    Returns the JSON-decoded response content when the status code is 200,
    otherwise the string '爬取失败!'.
    """
    # Pool of User-Agent strings; one is picked at random per request.
    user_agents = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
    ]
    request_headers = {'User-Agent': random.choice(user_agents)}
    response = requests.get(url=web_url, headers=request_headers)
    if response.status_code != 200:
        return '爬取失败!'
    # Decode the response body as JSON.
    return response.json()


url = 'http://d1.weather.com.cn/calendar_new/2022/101270101_202207.html'
data = get_data(url)
print(data)
# coding:utf-8
import requests
import random
import json


def get_data(web_url, timeout=10):
    """Fetch one month of calendar weather data and return it as Python objects.

    The request carries a randomly chosen User-Agent and a Referer header.
    The response is a JavaScript assignment (a variable name followed by the
    JSON array), so everything up to the first '=' is discarded before the
    JSON is parsed.

    Args:
        web_url: Address of the monthly data file.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests.get can block indefinitely.

    Returns:
        The parsed JSON data when the status code is 200, otherwise the
        string '爬取失败!'.

    Raises:
        requests.exceptions.RequestException: propagated from requests on
            connection errors or when the timeout expires.
        json.JSONDecodeError: when the payload is not valid JSON.
    """
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
    ]
    headers = {
        "Referer": "http://www.weather.com.cn/",
        # Pick a random User-Agent for each request.
        'User-Agent': random.choice(my_headers),
    }
    r = requests.get(url=web_url, headers=headers, timeout=timeout)
    if r.status_code != 200:
        return '爬取失败!'

    content = r.content.decode(encoding='utf-8')
    # Drop the leading variable assignment and keep only the data after '='.
    # Splitting on '=' instead of slicing a fixed 11 characters keeps this
    # working if the variable name changes length; a body without '=' is
    # parsed as-is.
    _, separator, payload = content.partition('=')
    return json.loads(payload if separator else content)


url = 'http://d1.weather.com.cn/calendar_new/2022/101270101_202207.html'
data = get_data(url)
print(data)
def get_y_m_url(year=2022, months=range(1, 7), city_code='101270101'):
    """Build the monthly data URLs for one city.

    Args:
        year: Four-digit year used in both the directory and the file name.
        months: Iterable of month numbers (1-12). Months are zero-padded to
            two digits, so months 10-12 produce valid file names too.
        city_code: City identifier used in the file name.

    Returns:
        A list of URL strings, one per month, in the order given. With the
        defaults this is January through June 2022 for city 101270101.
    """
    url_template = 'http://d1.weather.com.cn/calendar_new/{year}/{city}_{year}{month:02d}.html'
    return [
        url_template.format(year=year, city=city_code, month=month)
        for month in months
    ]


url_list_all = get_y_m_url()
# Fetch and print the data of every month in turn.
for url in url_list_all:
    weather_data = get_data(url)
    print(weather_data)
# Collects one dict per day of weather data.
weather_info = []
url = 'http://d1.weather.com.cn/calendar_new/2022/101110801_202206.html'
# Download the month's data.
weather_data = get_data(url)
for day in weather_data:
    # Keep only the four fields of interest for this day.
    weather_info.append({
        'date': day['date'],                   # date
        'rainfall_probability': day['hgl'],    # probability of rain
        'tem_max': day['hmax'],                # highest temperature
        'tem_min': day['hmin'],                # lowest temperature
    })
print(weather_info)
# Collects one dict per distinct day of weather data across all months.
weather_info = []
# (output key, key in the downloaded record), in output order:
# date, probability of rain, highest temperature, lowest temperature.
field_sources = (
    ('date', 'date'),
    ('rainfall_probability', 'hgl'),
    ('tem_max', 'hmax'),
    ('tem_min', 'hmin'),
)
for url in url_list_all:
    # Download one month of data.
    weather_data = get_data(url)
    for raw_day in weather_data:
        day_record = {field: raw_day[source] for field, source in field_sources}
        # Skip days that were already collected (de-duplication).
        if day_record in weather_info:
            continue
        weather_info.append(day_record)
# Save the weather data to a CSV file.
def save_csv(weather_data, path='weather_info.csv'):
    """Write the daily weather records to a CSV file with a header row.

    Args:
        weather_data: Iterable of dicts with the keys 'date',
            'rainfall_probability', 'tem_max' and 'tem_min'.
        path: Destination file; defaults to 'weather_info.csv'. An existing
            file is overwritten.

    Raises:
        ValueError: from csv.DictWriter if a record contains a key that is
            not one of the field names.
    """
    fieldnames = ['date', 'rainfall_probability', 'tem_max', 'tem_min']
    # The with-block closes the file even if writing fails part-way.
    # newline='' is what the csv module expects so it controls line endings.
    with open(path, 'w', encoding='UTF-8-SIG', newline='') as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(weather_data)


save_csv(weather_info_final)
# Line chart comparing the average temperature of Chengdu and Shenzhen.
# NOTE(review): x, y_cd and y_sz are not defined in this snippet; presumably
# they are prepared by data-loading code that is not shown here.
line = (
    Line(
        init_opts=opts.InitOpts(
            animation_opts=opts.AnimationOpts(animation_duration=5000),
            bg_color='rgba(255,250,205,0.2)',
            width='1000px',
            height='600px',
            # Page title now matches the chart title below
            # (it previously read "平均天气" instead of "平均温度").
            page_title='成都——深圳平均温度对比图',
            # Chart theme.
            theme=ThemeType.MACARONS,
        )
    )
    .add_xaxis(xaxis_data=x)
    .add_yaxis(series_name="成都", y_axis=y_cd, is_smooth=True)
    .add_yaxis(series_name="深圳", y_axis=y_sz, is_smooth=True)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="成都——深圳平均温度对比图"),
        xaxis_opts=opts.AxisOpts(name='年月'),
        yaxis_opts=opts.AxisOpts(name='温度 单位:℃'),
    )
    # `line` ends up holding the return value of render().
    .render('compare_average_tem.html')
)
import csv

import numpy
import pyecharts.options as opts
from pyecharts.charts import Line
# ThemeType is used below but was not imported in this snippet.
from pyecharts.globals import ThemeType

# Line chart comparing the highest temperature of Chengdu and Shenzhen.
# NOTE(review): x, y_cd and y_sz are not defined in this snippet; presumably
# they are prepared by data-loading code that is not shown here.
line = (
    Line(
        init_opts=opts.InitOpts(
            animation_opts=opts.AnimationOpts(animation_duration=5000),
            bg_color='rgba(255,250,205,0.2)',
            width='1000px',
            height='600px',
            page_title='成都——深圳最高温度对比图',
            theme=ThemeType.ROMANTIC,
        )
    )
    .add_xaxis(xaxis_data=x)
    .add_yaxis(series_name="成都", y_axis=y_cd, is_smooth=True)
    .add_yaxis(series_name="深圳", y_axis=y_sz, is_smooth=True)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="成都——深圳最高温度对比图"),
        xaxis_opts=opts.AxisOpts(name='年月'),
        yaxis_opts=opts.AxisOpts(name='温度 单位:℃'),
    )
    # `line` ends up holding the return value of render().
    .render('compare_max_tem.html')
)
import csv

import numpy
import pyecharts.options as opts
from pyecharts.charts import Line
# ThemeType is used below but was not imported in this snippet.
from pyecharts.globals import ThemeType

# Line chart comparing the lowest temperature of Chengdu and Shenzhen.
# NOTE(review): x, y_cd and y_sz are not defined in this snippet; presumably
# they are prepared by data-loading code that is not shown here.
line = (
    Line(
        init_opts=opts.InitOpts(
            animation_opts=opts.AnimationOpts(animation_duration=5000),
            bg_color='rgba(255,250,205,0.2)',
            width='1000px',
            height='600px',
            page_title='成都——深圳最低温度对比图',
            theme=ThemeType.WESTEROS,
        )
    )
    .add_xaxis(xaxis_data=x)
    .add_yaxis(series_name="成都", y_axis=y_cd, is_smooth=True)
    .add_yaxis(series_name="深圳", y_axis=y_sz, is_smooth=True)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="成都——深圳最低温度对比图"),
        xaxis_opts=opts.AxisOpts(name='年月'),
        yaxis_opts=opts.AxisOpts(name='温度 单位:℃'),
    )
    # `line` ends up holding the return value of render().
    .render('compare_min_tem.html')
)
import csv
from pyecharts.globals import ThemeType
import pyecharts.options as opts
from pyecharts.charts import Bar
import pandas as pd

# Bar chart: number of days per temperature interval, Chengdu vs. Shenzhen.
bar_chart = Bar(
    init_opts=opts.InitOpts(
        # Elastic ("jelly") entrance animation.
        animation_opts=opts.AnimationOpts(animation_delay=500, animation_easing="elasticOut"),
        bg_color='rgba(255,250,205,0.2)',
        width='1000px',
        height='600px',
        page_title='成都——深圳温度区间天数图',
        theme=ThemeType.INFOGRAPHIC,
    )
)
bar_chart.add_xaxis(xaxis_data=x)
bar_chart.add_yaxis(series_name="成都", y_axis=cd_max_count)
bar_chart.add_yaxis(series_name="深圳", y_axis=sz_max_count)
bar_chart.set_global_opts(
    title_opts=opts.TitleOpts(title="成都——深圳温度区间天数图"),
    xaxis_opts=opts.AxisOpts(name='温度区间'),
    yaxis_opts=opts.AxisOpts(name='天数 单位:天'),
)
# As in the chained form, `bar` holds the return value of render().
bar = bar_chart.render('compare_tem_count.html')
# Pie chart: share of each daily-high temperature interval for Chengdu.
pie_chart = Pie(
    init_opts=opts.InitOpts(
        # Elastic ("jelly") entrance animation.
        animation_opts=opts.AnimationOpts(animation_delay=500, animation_easing="elasticOut"),
        bg_color='rgba(255,250,205,0.2)',
        width='1000px',
        height='600px',
        page_title='成都半年每日最高温度占比',
        # theme=ThemeType.INFOGRAPHIC
    )
)
# Each slice is an (interval label, day count) pair.
pie_chart.add(
    '成都180天高温温度占比',
    list(zip(attr_tem_interval, cd_max_count)),
)
pie_chart.set_global_opts(
    title_opts=opts.TitleOpts(title="成都半年每日最高温度占比"),
    legend_opts=opts.LegendOpts(
        pos_left='center',
        pos_bottom='bottom',
        orient="horizontal",
    ),
)
# Label placeholders: a = series name, b = data item name, c = value, d = percentage.
pie_chart.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c}天({d}%)'))
# Colour of each slice.
pie_chart.set_colors(['#00FFFF', '#00BFFF', '#FFD700', '#FFA500', '#FF0000'])
# As in the chained form, `pie` holds the return value of render().
pie = pie_chart.render('cd_tem_pie.html')
# Nightingale rose chart: share of each daily-high temperature interval for Chengdu.
rose_chart = Pie(
    init_opts=opts.InitOpts(
        # Elastic ("jelly") entrance animation.
        animation_opts=opts.AnimationOpts(animation_delay=500, animation_easing="elasticOut"),
        bg_color='rgba(255,250,205,0.2)',
        width='1000px',
        height='600px',
        page_title='成都半年每日最高温度占比——南丁格尔图',
        # theme=ThemeType.INFOGRAPHIC
    )
)
rose_chart.add(
    '成都180天高温温度占比',
    # Each slice is an (interval label, day count) pair.
    list(zip(attr_tem_interval, cd_max_count)),
    # Draw as a Nightingale rose chart, where the radius encodes the value.
    # Two modes are available:
    #   'radius': the sector angle shows the percentage, the radius the value.
    #   'area':   all sectors share the same angle; only the radius shows the value.
    rosetype="radius",
    # Pie radius. As an array, the first item is the inner radius and the
    # second the outer radius (setting both produces a ring). A percentage is
    # relative to half of the smaller of the container's height and width.
    radius="55%",
    # Centre of the pie: first item is the x coordinate, second the y
    # coordinate. As percentages they are relative to the container's width
    # and height respectively.
    center=["50%", "50%"],
)
rose_chart.set_global_opts(
    title_opts=opts.TitleOpts(title="成都半年每日最高温度占比——南丁格尔图"),
    legend_opts=opts.LegendOpts(
        pos_left='center',
        pos_bottom='bottom',
        orient="horizontal",
    ),
)
# Label placeholders: a = series name, b = data item name, c = value, d = percentage.
rose_chart.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c}天({d}%)'))
# Colour of each slice.
rose_chart.set_colors(['#00FFFF', '#00BFFF', '#FFD700', '#FFA500', '#FF0000'])
# As in the chained form, `pie` holds the return value of render().
pie = rose_chart.render('cd_tem_pie_coxcomb.html')
若本篇内容对您有所帮助,请三连点赞,关注,收藏支持下。
创作不易,白嫖不好,各位的支持和认可,就是我创作的最大动力,我们下篇文章见!
圈圈仔OvO | 文
如果本篇博客有任何错误,请批评指教,不胜感激 !
原文链接:https://blog.csdn.net/m0_47258632/article/details/125982902
作者:php码农的美好生活
链接:http://www.phpheidong.com/blog/article/546637/8eefea6cbca0446c49ca/
来源:php黑洞网
任何形式的转载都请注明出处;如有侵权,一经发现,必将追究其法律责任。
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 php黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-4
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用做商业用途。若文章涉及色情,反动,侵权等违法信息,请向我们举报,一经核实我们会立即删除!