1. Data filtering with the BS4 module
Example 1: fetching Red Bull branch-office information: http://www.redbull.com.cn/about/branch (the page structure is quite uniform)
# Fetch Red Bull branch-office data: http://www.redbull.com.cn/about/branch
import requests
from bs4 import BeautifulSoup
import re
import pandas
import openpyxl  # the engine pandas uses to write .xlsx files

res = requests.get('http://www.redbull.com.cn/about/branch')
# First check whether any extra request options (headers etc.) are needed
# print(res.text)
"""
Target fields: company name, company address, company email, company phone, e.g.:
<h2>红牛杭州分公司</h2>
<p class='mapIco'>杭州市上城区庆春路29号远洋大厦11楼A座</p>
<p class='mailIco'>310009</p>
<p class='telIco'>0571-87045279/7792</p>
"""
# Approach 1: regular expressions
# title_list = re.findall('<h2>(.*?)</h2>', res.text)
# addr_list = re.findall("<p class='mapIco'>(.*?)</p>", res.text)
# email_list = re.findall("<p class='mailIco'>(.*?)</p>", res.text)
# phone_list = re.findall("<p class='telIco'>(.*?)</p>", res.text)
# print(phone_list)  # the four lists correspond index by index
#
# # 1. Build a dict of columns
# data_dict = {
#     "公司名称": title_list,
#     "公司地址": addr_list,
#     "公司邮箱": email_list,
#     "公司电话": phone_list,
# }
# df = pandas.DataFrame(data_dict)
# df.to_excel(r'company.xlsx')

# Approach 2: BeautifulSoup
soup = BeautifulSoup(res.text, 'lxml')
# title_list = soup.find_all(name='h2')
# for title in title_list:
#     print(title.text)
# The same thing as a list comprehension
title_list = [title.text for title in soup.find_all(name='h2')]
# print(title_list)
# addr_list = soup.find_all(name='p', class_='mapIco')
# for addr in addr_list:
#     print(addr.text)
addr_list = [addr.text for addr in soup.find_all(name='p', class_='mapIco')]
email_list = [email.text for email in soup.find_all(name='p', class_='mailIco')]
phone_list = [phone.text for phone in soup.find_all(name='p', class_='telIco')]

print(len(title_list))
# Iterate over however many branches were actually found,
# rather than hard-coding 40
for i in range(len(title_list)):
    print("""
    "公司名称": %s,
    "公司地址": %s,
    "公司邮箱": %s,
    "公司电话": %s
    """ % (title_list[i], addr_list[i], email_list[i], phone_list[i]))
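If you would rather work with one record per branch than four parallel lists, zip() pairs the lists up and stops at the shortest one, so a selector that misses an element cannot cause an IndexError. A minimal sketch built on the lists above:

# Build one dict per branch; zip() stops at the shortest list, so a
# missing element on the page cannot raise an IndexError here.
rows = [
    {"公司名称": t, "公司地址": a, "公司邮箱": e, "公司电话": p}
    for t, a, e, p in zip(title_list, addr_list, email_list, phone_list)
]
print(rows[0])  # spot-check the first record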
Example 2: scraping Lianjia listing data (data processing)
import requests
from bs4 import BeautifulSoup

"""
1. Work out the URL pattern
    https://sh.lianjia.com/ershoufang/huangpu/
    https://sh.lianjia.com/ershoufang/pudong/
    https://<city abbreviation>.lianjia.com/<property type>/<district>/
2. Second-hand housing in Pudong, Shanghai: try sending a request first
    Option 1: grab the li tags that hold each listing, then dig inside them
    Option 2: query the tag for each field directly
"""
res = requests.get('https://sh.lianjia.com/ershoufang/pudong/')
# print(res.text)
soup = BeautifulSoup(res.text, 'lxml')

# Filter the data according to the page structure
div_list = soup.find_all(name='div', class_='info')
title_list = [div.find(name='a').text for div in div_list if div.find(name='a')]
link_list = [div.find(name='a').get('href') for div in div_list if div.find(name='a')]

div1_list = soup.find_all(name='div', attrs={"class": 'positionInfo'})
addr_list = [div1.text for div1 in div1_list]
# addr_list = [div1.find('a').text for div1 in div1_list]
# print(addr_list)
# for address in addr_list:
#     res = address.split('-')
#     print(res)
# addr_list1 = [div1.find_all('a')[1].text for div1 in div1_list]
# print(addr_list1)

div2_list = soup.find_all(name='div', attrs={"class": "houseInfo"})
info_list = [div2.text for div2 in div2_list]
"""
'1室1厅 | 59平米 | 南 | 精装 | 中楼层(共14层) | 2010年建 | 板楼'
layout | area | orientation | decoration | floor | year built | building type
"""
hx = [i.split('|')[0].strip() for i in info_list]   # layout
mj = [i.split('|')[1].strip() for i in info_list]   # area
cx = [i.split('|')[2].strip() for i in info_list]   # orientation
zx = [i.split('|')[3].strip() for i in info_list]   # decoration
lc = [i.split('|')[4].strip() for i in info_list]   # floor
nd = [i.split('|')[5].strip() for i in info_list]   # year built
lx = [i.split('|')[-1].strip() for i in info_list]  # building type

div3_list = soup.find_all(name='div', attrs={"class": "followInfo"})
gz = [div3.text for div3 in div3_list]  # follow info (collected but not exported below)

div4_list = soup.find_all(name='div', attrs={"class": "totalPrice"})
total_price = [div4.text for div4 in div4_list]

div5_list = soup.find_all(name='div', attrs={"class": "unitPrice"})
unit = [div5.text for div5 in div5_list]

"""Export the result"""
import pandas as pd

data_dict = {
    "名称": title_list,
    "地址": addr_list,
    "户型": hx,
    "面积": mj,
    "朝向": cx,
    "装修": zx,
    "楼层": lc,
    "年代": nd,
    "楼型": lx,
    "总价": total_price,
    "单价": unit,
}
df = pd.DataFrame(data_dict)
df.to_excel(r'链家.xlsx')

# Pagination: you only need to study the URL pattern (there always is one)
#   page 1: https://sh.lianjia.com/ershoufang/jingan/
#   page 2: https://sh.lianjia.com/ershoufang/jingan/pg2/
#   page 3: https://sh.lianjia.com/ershoufang/jingan/pg3/
#   ...
#   page N: https://sh.lianjia.com/ershoufang/jingan/pgN/
# Page 1 should also work written as https://sh.lianjia.com/ershoufang/jingan/pg1/
for i in range(1, 100):
    base_url = "https://sh.lianjia.com/ershoufang/jingan/pg%s/"
    print(base_url % i)
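To actually crawl multiple pages rather than just print the URLs, the single-page parsing above can be wrapped in a function and called once per page. A minimal sketch, assuming the page structure is identical across pages and collecting only the titles for brevity (the get_titles helper is hypothetical, named here for illustration):

import time

import requests
from bs4 import BeautifulSoup


def get_titles(url):
    """Fetch one listing page and return the listing titles on it."""
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')
    div_list = soup.find_all(name='div', class_='info')
    return [div.find('a').text for div in div_list if div.find('a')]


if __name__ == '__main__':
    all_titles = []
    for i in range(1, 4):  # first three pages as a smoke test
        url = "https://sh.lianjia.com/ershoufang/jingan/pg%s/" % i
        all_titles.extend(get_titles(url))
        time.sleep(2)  # pause between requests to avoid hammering the site
    print(len(all_titles))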
Example 3: scraping weather data: the site does not load its data in one request
""" 有时候网站的数据不是一次性加载的,内部可能是通过js动态请求 http://tianqi.2345.com/wea_history/58362.htm 有些网站内容编码查看需要在线json格式化 通过network检查找内部api接口 虹口 http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=11 http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=12 http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2021&date%5Bmonth%5D=1 http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=区域&areaInfo%5BareaType%5D=2&date%5Byear%5D=年份&date%5Bmonth%5D=月份 """ import requests import pandas as pd res = requests.get("http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=12") json_dict = res.json() data = json_dict.get('data') # 直接获取网页table标签内部所有的数据 res = pd.read_html(data) res[0].to_excel(r'weather.xlsx')
Example 4: scraping Autohome news: filtering out interference items
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.autohome.com.cn/news/")
res.encoding = 'GBK'  # the page is GBK-encoded; without this the text is garbled
soup = BeautifulSoup(res.text, 'lxml')

ul_ele = soup.find(name='ul', class_="article")
li_list = ul_ele.find_all('li')
# print(li_list)

title_list = []
link_list = []
info_list = []
time_list = []
num_list = []
for li in li_list:
    # There are interference items such as
    # <li id="ad_tw_04" style="display: none;"></li>,
    # hence the if-guard before each lookup.
    if li.find('a'):
        link = li.find('a')['href']
        # print('https:' + link)
        link_list.append('https:' + link)
    # The news title lives in an h3
    if li.find('h3'):
        title = li.find('h3').text
        title_list.append(title)
    if li.find('p'):
        info = li.find('p').text
        info_list.append(info)
    # if li.find('span'):
    #     tm = li.find('span').text
    #     time_list.append(tm)
    if li.select('span.fn-left'):
        tm = li.select('span.fn-left')[0].text
        # print(tm)
        time_list.append(tm)
    if li.select('span.fn-right'):
        num = li.select('span.fn-right')[0].find('em').text
        num_list.append(num)
        # The comment count is computed dynamically (it defaults to 0);
        # the real value has to be found in a JS-loaded file.
        # comment = li.select('span.fn-right')[0].find_all('em')
        # print(comment)
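The repeated guard-then-extract pattern above can be factored into a small helper so that a missing tag yields an empty string instead of an AttributeError. A minimal sketch (the safe_text helper is hypothetical, not part of BS4):

def safe_text(node, selector):
    """Return the stripped text of the first CSS match inside node, or ''."""
    found = node.select_one(selector)
    return found.text.strip() if found else ''

# Usage against one news item from the loop above:
# title = safe_text(li, 'h3')
# tm = safe_text(li, 'span.fn-left')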
Example 5: scraping Douban data with openpyxl
"""
Scrape the Douban Top 250 movie list
    1. Try a single page first
    2. Then work out the pagination
        https://movie.douban.com/top250
        https://movie.douban.com/top250?start=25&filter=
        https://movie.douban.com/top250?start=50&filter=
        ...
    # Working backwards, page 1 is https://movie.douban.com/top250?start=0&filter=
"""
import time

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

wb = Workbook()
w1 = wb.create_sheet('电影排行榜', index=0)
# Header row
w1['A1'] = '序号'
w1['B1'] = '名称'
w1['C1'] = '链接'
w1['D1'] = '评分'
w1['E1'] = '人数'

# Row counter, declared up front (row 1 is the header)
count = 1
for i in range(0, 250, 25):
    base_url = 'https://movie.douban.com/top250?start=%s&filter='
    url = base_url % i
    res = requests.get(
        url,
        # Send a browser User-Agent with the request
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"
        }
    )
    soup = BeautifulSoup(res.text, 'lxml')
    ol = soup.find(name='ol', class_='grid_view')
    li_list = ol.find_all(name='li')
    for li in li_list:
        count += 1
        title = li.find(name='span').text
        link = li.find(name='a').get('href')
        num = li.select('.rating_num')[0].text
        comment = li.find(name='div', class_='star').find_all('span')[-1].text
        # Write one row per movie
        w1['A%s' % count] = count - 1
        w1['B%s' % count] = title
        w1['C%s' % count] = link
        w1['D%s' % count] = num
        w1['E%s' % count] = comment
    # Deliberately pause between pages to avoid an IP ban
    time.sleep(5)
wb.save(r'movie.xlsx')

"""
The code above can also be packaged as a function plus a launch script,
as sketched below:
    def get_data(url):
        ...
    if __name__ == '__main__':
        for i in range(0, 250, 25):
            base_url = 'https://movie.douban.com/top250?start=%s&filter='
            url = base_url % i
            get_data(url)
"""
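Filling in the get_data skeleton from the closing docstring gives a reusable per-page function. A minimal sketch, assuming the same selectors as above and returning rows instead of writing Excel cells directly:

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"
}


def get_data(url):
    """Return a list of (title, link, rating, raters) tuples for one page."""
    res = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(res.text, 'lxml')
    ol = soup.find(name='ol', class_='grid_view')
    rows = []
    for li in ol.find_all(name='li'):
        title = li.find(name='span').text
        link = li.find(name='a').get('href')
        num = li.select('.rating_num')[0].text
        comment = li.find(name='div', class_='star').find_all('span')[-1].text
        rows.append((title, link, num, comment))
    return rows


if __name__ == '__main__':
    for i in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start=%s&filter=' % i
        for row in get_data(url):
            print(row)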
Summary
1. First try scraping a single page, or even just a few records.
2. Once the code logic works end to end, move on to handling multiple pages.