Fixing garbled text (mojibake)
requests.get().text is decoded with the encoding taken from the response headers; when the result comes out garbled, fetch the raw bytes with .content and decode them yourself.

r = requests.get("http://www.baidu.com")

# Option 1: decode the raw bytes yourself
html = r.content
html = str(html, 'utf-8')  # or: html_doc = html.decode("utf-8", "ignore")

# Option 2: tell requests the encoding before reading .text
r.encoding = 'utf-8'
html = r.text

# When you don't know the right codec, round-trip the mis-decoded text; the charset is usually declared in the HTML header
res = r.text
res = res.encode('iso-8859-1').decode('gbk')
Method 2 (if the above still fails):
# -*- coding: utf8 -*-
import requests

req = requests.get("http://news.sina.com.cn/")
if req.encoding == 'ISO-8859-1':
    encodings = requests.utils.get_encodings_from_content(req.text)
    if encodings:
        encoding = encodings[0]
    else:
        encoding = req.apparent_encoding
    # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace')
    encode_content = req.content.decode(encoding, 'replace')  # 'replace' swaps undecodable bytes for the U+FFFD replacement character
print(encode_content)
with open('test.html', 'w', encoding='utf-8') as f:
    f.write(encode_content)

# Source: chaowanghn, https://blog.csdn.net/chaowanghn/article/details/54889835
Jupyter shortcuts
- Insert cell: a (above), b (below)
- Delete cell: x
- Run cell: Shift+Enter
- Tab: code completion
- Switch cell mode: y (markdown -> code), m (code -> markdown)
- Shift+Tab: open the help tooltip
Types of crawlers:
- General-purpose crawler: fetches whole pages indiscriminately (what search engines do)
- Focused crawler: extracts only the specific data you care about from a page
- Incremental crawler: re-crawls only content that has been added or updated
Writing a file without with open: urllib

import urllib.request
urllib.request.urlretrieve(url, 'a.jpg')  # downloads url straight to a.jpg
requests.get / requests.post
# GET with query parameters (url is defined elsewhere)
for i in range(5):
    param = {
        'type': 'tv',
        'tag': '热门',
        'sort': 'recommend',
        'page_limit': 20,
        'page_start': i,
    }
    cont = requests.get(url, params=param).json()
    print(cont)
# POST with form data
url = 'https://fanyi.baidu.com/sug'
wd = input('enter a word:')
data = {
    'kw': wd
}
response = requests.post(url=url, data=data)
The headers dict
headers = {
    'Connection': 'close',  # drop the connection as soon as the request finishes (frees the connection pool promptly)
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
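Passing the dict to a request is then a one-liner; the URL here is only a placeholder:

import requests  # headers is the dict defined above

res = requests.get('https://example.com', headers=headers)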
Reading the result of a requests call
content = requests.get(url, params=param)
a = content.text     # str
b = content.content  # raw bytes (b'...')
c = content.json()   # parsed JSON
etree
Sometimes you can't find where an href lives. Search the returned content for .mp4 (or another file extension), or for the href of the link you actually clicked; it may also be buried in the JavaScript. A sketch of that search follows below.
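A minimal sketch of that search, assuming url and headers from the surrounding notes; the variable name page_text and the .mp4 pattern are illustrative:

import re
import requests

page_text = requests.get(url, headers=headers).text          # same fetch as elsewhere in these notes
links = re.findall(r'https?://[^"\'\s]+\.mp4', page_text)    # scan the raw body for .mp4 URLs
print(links)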
from lxml import etree
import requests

text = requests.get(url, headers=headers).text  # fetch the page HTML
tree = etree.HTML(text)                         # instantiate etree
lis = tree.xpath('/html/body/div[5]/div[5]/div[1]/ul/li')  # xpath parse
for el in lis:
    # In the second-level parse, prefix the path with '.'; otherwise it matches from the top of the document.
    # Without text() the result is a list of elements; take an item with [0].
    a = el.xpath('./div[2]/h2/a/text()')
    a = el.xpath('./div[3]//text()')
    print(a)
for el in lis:
    a = el.xpath('./a/img/@src')[0]  # xpath returns a list; use [0] to get the element
    print(a)
res = requests.get(url, headers=headers)
res = res.text
res = res.encode('iso-8859-1').decode('gbk')  # when you don't know the codec, try this; the charset is usually in the HTML header
# Within one page's etree, grab the content matched by several xpath expressions at once with |
li_list = tree.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul/div[2]/li')
# Downloading a file
data = requests.get(url=download_url, headers=headers).content
fileName = name + '.rar'
with open(fileName, 'wb') as fp:
    fp.write(data)
Concurrency
from multiprocessing.dummy import Pool  # thread-backed Pool with the multiprocessing API

pool = Pool(5)           # 5 worker threads
pool.map(getvideo, lst)  # run getvideo over every item of lst concurrently
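A self-contained sketch of how the pool fits a download job; the URLs, the getvideo body, and the file-naming rule are all assumptions for illustration:

from multiprocessing.dummy import Pool
import requests

urls = ['http://example.com/a.mp4', 'http://example.com/b.mp4']  # hypothetical URLs

def getvideo(url):
    data = requests.get(url).content                 # download one file
    with open(url.rsplit('/', 1)[-1], 'wb') as fp:   # file name taken from the URL (assumption)
        fp.write(data)

pool = Pool(5)
pool.map(getvideo, urls)  # blocks until every download finishes
pool.close()
pool.join()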
Captcha recognition
Recognize it with the YunDaMa (云打码) service
import json, time, requests  # http.client, mimetypes and urllib were imported in the original sample but never used

######################################################################

class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey,
                'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if cid > 0:
            for i in range(0, timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if response:
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text

######################################################################

username = 'username'      # account name
password = 'password'      # account password
appid = 1                  # software ID, a required developer parameter; find it under "我的软件" in the developer console
appkey = '22cc5376925e9387a23cf797cb9ba745'  # software key, also from "我的软件" in the developer console
filename = 'getimage.jpg'  # the captcha image file
codetype = 1004            # captcha type, e.g. 1004 = 4 alphanumeric characters; fill this in accurately
                           # (pricing and accuracy depend on it); all types: http://www.yundama.com/price.html
timeout = 60               # timeout in seconds

if username == 'username':
    print('Set the parameters above before testing')
else:
    yundama = YDMHttp(username, password, appid, appkey)       # initialize
    uid = yundama.login()                                      # log in to YunDaMa
    print('uid: %s' % uid)
    balance = yundama.balance()                                # check the account balance
    print('balance: %s' % balance)
    cid, result = yundama.decode(filename, codetype, timeout)  # recognize: image path, captcha type ID, timeout (s)
    print('cid: %s, result: %s' % (cid, result))

######################################################################
session
session = requests.Session()
session.get(url, params=params)
session.post(login_url, data=data, headers=headers)

Compared with plain requests calls, a Session uses more memory, but it stores the cookies it receives and re-sends them on later requests.
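A minimal login sketch under made-up names (the login URL, form fields, and profile URL are all assumptions); the point is that the Session carries the cookie from the first request into the second:

import requests

session = requests.Session()
# the login response sets a session cookie, which the Session stores
session.post('https://example.com/login', data={'user': 'me', 'pwd': 'secret'})
# the stored cookie is sent automatically here, so the server sees a logged-in user
profile = session.get('https://example.com/profile')
print(profile.status_code)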
BeautifulSoup
soup = BeautifulSoup(content, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')  # CSS select; returns a list of Tag (soup) objects
text = soup.find('div', class_='chapter_content').text  # .text collects text from the tag and all of its descendants
Usage:
- Import: from bs4 import BeautifulSoup
- Idea: turn an HTML document into a BeautifulSoup object, then find the nodes you want through the object's methods and attributes
  (1) From a local file:
      - soup = BeautifulSoup(open('local file'), 'lxml')
  (2) From network content:
      - soup = BeautifulSoup(a_str_or_bytes, 'lxml')
  (3) Printing the soup object shows the HTML content

Methods:
  (1) Look up by tag name
      - soup.a  finds only the first matching tag
  (2) Get attributes
      - soup.a.attrs          all of the tag's attributes and values, as a dict
      - soup.a.attrs['href']  the href attribute
      - soup.a['href']        shorthand for the same
  (3) Get text
      - soup.a.string
      - soup.a.text
      - soup.a.get_text()
      Note: if the tag contains nested tags, string returns None, while the other two still return the text
  (4) find: the first tag that matches
      - soup.find('a')
      - soup.find('a', title="xxx").text
      - soup.find('a', alt="xxx")
      - soup.find('a', class_="xxx")
      - soup.find('a', id="xxx")
  (5) find_all: every tag that matches
      - soup.find_all('a')
      - soup.find_all(['a', 'b'])    all a and b tags
      - soup.find_all('a', limit=2)  only the first two
  (6) Pick content with a CSS selector: soup.select('#feng')
      - common selectors: tag (a), class (.), id (#), hierarchy
      - hierarchy selectors:
          div .dudu #lala .meme .xixi  any number of levels down
          div > p > a > .lala          exactly one level down
      Note: select always returns a list; index into it to pull out the object you want
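A tiny end-to-end sketch of those lookups on a made-up snippet of HTML:

from bs4 import BeautifulSoup

html = '<div class="chapter_content"><ul><li><a href="/c1" id="feng">Chapter 1</a></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.a['href'])                                   # '/c1'
print(soup.select('#feng')[0].text)                     # 'Chapter 1' (select always returns a list)
print(soup.find('div', class_='chapter_content').text)  # 'Chapter 1'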