1. Jupyter shortcut keys
Insert cell: a (above), b (below)
Delete cell: x
Run cell: Shift+Enter
Code completion: Tab
Switch cell mode: y (code), m (markdown)
Open help/docstring: Shift+Tab
2. requests
Workflow for writing code with the requests module:
1. Specify the URL
2. Send the request
3. Get the data from the response object
4. Parse the data
5. Persist the data
```python
# Crawl the Sogou homepage
import requests

# 1. specify the URL
url = 'https://www.sogou.com/'
# 2. send the request
response = requests.get(url=url)
# 3. get the data from the response object
page_text = response.text
# 4. persist the data
with open('./sougou.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
```
```python
# Requirement: crawl the Sogou result page for a given search term
import requests

url = 'https://www.sogou.com/web'
# pack the request parameters
wd = input('enter a word:')
params = {
    'query': wd
}
response = requests.get(url=url, params=params)

page_text = response.content

filename = wd + '.html'

with open(filename, 'wb') as f:
    f.write(page_text)
print('over')
```
```python
# Crawl Baidu Translate suggestions
import requests

url = 'https://fanyi.baidu.com/sug'
wd = input('enter a word:')
data = {
    'kw': wd
}

response = requests.post(url=url, data=data)

print(response.json())
# text:    string
# content: raw bytes
# json():  parsed Python object
```
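A minimal sketch contrasting the three response accessors (httpbin.org is used here only as a convenient echo service, an assumption not in the original notes; any JSON-returning endpoint works):

```python
import requests

response = requests.get('https://httpbin.org/get')

print(type(response.text))     # <class 'str'>   - body decoded as text
print(type(response.content))  # <class 'bytes'> - raw body bytes
print(type(response.json()))   # <class 'dict'>  - body parsed as JSON
```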
```python
# Crawl movie details from the Douban ranking at https://movie.douban.com/
import requests

url = 'https://movie.douban.com/j/chart/top_list'
params = {
    "type": "11",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}
movie_data = requests.get(url=url, params=params).json()
print(movie_data)
```
```python
# Requirement: crawl cosmetics production licence data from the China NMPA site
# http://125.35.6.84:81/xk/
import requests

url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
# first request: page through the list endpoint and collect the IDs
id_list = []
for page in range(1, 2):
    data = {
        "on": "true",
        "page": str(page),
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": "",
    }
    json_data = requests.post(url=url, data=data, headers=headers).json()
    for dic in json_data["list"]:
        id_list.append(dic["ID"])

# second request: fetch the detail record for each collected ID
detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for id in id_list:
    data = {
        "id": id
    }
    detail_data = requests.post(url=detail_url, data=data, headers=headers).json()
    print(detail_data)
```
```python
# Download an image, two ways
# Method 1: requests
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551194867510&di=bbf61d08b5497fa04a519267c3efb3ee&imgtype=0&src=http%3A%2F%2Fimg4.duitang.com%2Fuploads%2Fitem%2F201402%2F09%2F20140209170955_AiTUh.thumb.700_0.jpeg'
img_data = requests.get(url=url, headers=headers).content
with open('./bingzhang.jpg', 'wb') as f:
    f.write(img_data)

# Method 2: urllib
import urllib.request

url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551194867510&di=bbf61d08b5497fa04a519267c3efb3ee&imgtype=0&src=http%3A%2F%2Fimg4.duitang.com%2Fuploads%2Fitem%2F201402%2F09%2F20140209170955_AiTUh.thumb.700_0.jpeg'
urllib.request.urlretrieve(url=url, filename='./liweier.jpg')
```
3. Parsing data with regular expressions
Basic usage
```python
import re

string = '''fall in love with you
i love you very much
i love she
i love her'''
# re.M (multiline): ^ matches at the start of every line,
# so this returns each line beginning with "i"
re.findall('^i.*', string, re.M)

string1 = """细思极恐
你的队友在看书
你的敌人在磨刀
你的闺蜜在减肥
隔壁老王在练腰
"""
# re.S (dotall): '.' matches newlines too, so the pattern
# can span the whole string
re.findall('.*', string1, re.S)
```
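Note that when the pattern contains a capture group, re.findall returns only the group's contents rather than the whole match; the image crawler in the example below relies on this. A quick illustration (the HTML snippet here is made up):

```python
import re

html = '<div class="thumb"><img src="//pic.example.com/a.jpg" alt="pic"></div>'
# only the contents of the group "(.*?)" are returned, i.e. the src value
print(re.findall('<img src="(.*?)" alt=', html, re.S))
# ['//pic.example.com/a.jpg']
```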
Example
```python
import re
import os
import requests
import urllib.request

url = 'https://www.qiushibaike.com/pic/page/%s/?s=5170618'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}

if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')

start_page = int(input('enter a start pageNum:'))
end_page = int(input('enter an end pageNum:'))

for page in range(start_page, end_page + 1):
    new_url = url % page
    page_text = requests.get(url=new_url, headers=headers).text
    # the capture group extracts each image's src value
    img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>', page_text, re.S)
    for img_url in img_url_list:
        img_url = 'https:' + img_url
        imgName = img_url.split('/')[-1]
        imgPath = 'qiutu/' + imgName
        urllib.request.urlretrieve(url=img_url, filename=imgPath)
        print(imgName, 'downloaded successfully')
print('over!!!')
```
4. Parsing data with bs4
Install first:

```
pip install bs4
pip install lxml
```
How parsing works:
1. Load the source code to be parsed into a bs object.
2. Call methods or attributes on the bs object to locate the target tags in the source: find('name', class_="xxx"), find_all(), select()
3. Extract the text or attribute values from the located tags: string, text, get_text(), a['href'] (see the sketch after this list)
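A minimal sketch of the three steps on an inline HTML snippet (the HTML below is made up for illustration; the selector mirrors the example that follows):

```python
from bs4 import BeautifulSoup

html = '''
<div class="book-mulu">
  <ul>
    <li><a href="/book/1.html">Chapter 1</a></li>
    <li><a href="/book/2.html">Chapter 2</a></li>
  </ul>
</div>
'''

# 1. load the source into a bs object
soup = BeautifulSoup(html, 'lxml')

# 2. locate tags
first = soup.find('a')                            # first matching tag
a_list = soup.select('.book-mulu > ul > li > a')  # CSS selector

# 3. extract text and attribute values
print(first.string)   # Chapter 1
print(first['href'])  # /book/1.html
for a in a_list:
    print(a.get_text(), a['href'])
```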
Example:
```python
import requests
from bs4 import BeautifulSoup

url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}

page_text = requests.get(url=url, headers=headers).text

soup = BeautifulSoup(page_text, 'lxml')

# locate every chapter link in the table of contents
a_list = soup.select('.book-mulu > ul > li > a')

fp = open('sanguo.txt', 'w', encoding='utf-8')

for a in a_list:
    title = a.string
    detail_url = 'http://www.shicimingju.com' + a["href"]

    detail_page_text = requests.get(url=detail_url, headers=headers).text

    # parse the chapter page and extract its text content
    soup = BeautifulSoup(detail_page_text, 'lxml')
    content = soup.find('div', class_='chapter_content').text

    fp.write(title + ' ' + content)
    print(title, 'downloaded successfully')
print('over!')
fp.close()
```