1. Background
Every time I build a GET-request crawler with Python's requests library, I end up writing it from scratch, which is too inefficient. I need a way to generate the code quickly.
2. Approach
Collect and organize the code snippets now; when a new crawler is needed, pull them back out, piece them together, and adapt them. Over time, as the collection grows, it can be consolidated into a project-level summary.
3. Implementation
(1) Step 1: quickly build the request headers
(2) Step 2: quickly build the request itself (a GET request with requests)
(3) Step 3: quickly parse the data (HTML, JSON, etc.) and save it to a file
(1) Step 1: quickly build the request headers
def denghao2maohao(cookie_str):
    # "denghao2maohao": turn a raw "k=v; k=v" cookie string into a dict
    # Split the cookie string into "key=value" pairs
    list1 = cookie_str.split(";")
    # print(list1)
    # Initialize the result dict
    cookie_dict_str = {}
    for item in list1:
        list2 = item.split("=", 1)  # split on "=" only once, since values may contain "="
        # print(list2)
        dict_key = list2[0].strip()
        dict_value = list2[1].strip()
        cookie_dict_str[dict_key] = dict_value
    return cookie_dict_str
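# A quick sanity check on a short, made-up cookie string; the "abc=123"
# value is why the maxsplit=1 above matters (cookie values may themselves
# contain "="):
# >>> denghao2maohao("ss_lang=cs; product=WGSN; token=abc=123")
# {'ss_lang': 'cs', 'product': 'WGSN', 'token': 'abc=123'}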
def maohao2yinhao(maohao_str):
    # "maohao2yinhao": turn raw colon-separated header lines into a quoted dict
    list1 = maohao_str.strip().splitlines()
    maohao_str_dict = {}
    for item in list1:
        if item.strip().startswith(":"):
            # HTTP/2 pseudo-header such as ":authority:..." -- split on ":" twice
            # print(item.strip())
            list2 = item.strip().split(":", 2)
            # print(list2)
            new_key = list2[1]
            new_value = list2[2].strip()
            # maohao_str_dict[":" + new_key] = new_value  # keep the leading colon
            maohao_str_dict[new_key] = new_value  # drop the leading colon
            print("'%s':'%s'," % (new_key, new_value))
        else:
            # print(item)
            # Ordinary header -- split on ":" only once
            list2 = item.split(":", 1)
            new_key = list2[0].strip()
            new_value = list2[1].strip()
            maohao_str_dict[new_key] = new_value
            print("'%s':'%s'," % (new_key, new_value))  # print the formatted key-value pair
    return maohao_str_dict
if __name__ == '__main__':
    # # Cookie string: convert the "key=value" pairs into a dict
    # cookie_str = "ss_lang=cs; product=WGSN; ss_udid=0ed9a26e6dd6bb892c796cda69bca4a3; PHPSESSID=ci56j78njjgdqde5tjepslaah5; exclusionChecked=True; ss_token=f77dcbc5a65f43977e02b61e9d6ff947; trwv.uid=stylesight-1525165098107-fd45157e%3A2; trwsa.sid=stylesight-1525177471085-3d01fa38%3A2; _ga=GA1.2.1824486173.1525165097; _gid=GA1.2.1794994253.1525165097; cp_browStat=Logged In; cp_UserID=-1; cp_hybridBrowStat=Logged In; cp_SubStat=Subscriber"
    # # print(cookie_str)
    # cookie_dict_str = denghao2maohao(cookie_str)
    # print("====== [1] cookie: equals-separated pairs to a dict ========")
    # print(cookie_str)
    # print()
    # print(cookie_dict_str)
    # Raw request headers: quote the colon-separated pairs and drop the leading colon
maohao_str = """
:authority:www.wgsnchina.cn
:method:POST
:path:/api/cherry/search/query
:scheme:https
accept:application/json, text/plain, */*
accept-encoding:gzip, deflate, br
accept-language:zh-CN,zh;q=0.9
content-length:149
content-type:application/json;charset=UTF-8
cookie:ss_lang=cs; product=WGSN; ss_udid=0ed9a26e6dd6bb892c796cda69bca4a3; PHPSESSID=ci56j78njjgdqde5tjepslaah5; exclusionChecked=True; ss_token=f77dcbc5a65f43977e02b61e9d6ff947; _gat_UA-1004012-2=1; cp_SubStat=Subscriber; cp_browStat=Logged In; cp_UserID=-1; cp_hybridBrowStat=Logged In; _dc_gtm_UA-1004012-2=1; _ga=GA1.2.1824486173.1525165097; _gid=GA1.2.1794994253.1525165097; trwv.uid=stylesight-1525165098107-fd45157e%3A3; trwsa.sid=stylesight-1525179968287-e61a7bc2%3A2
origin:https://www.wgsnchina.cn
referer:https://www.wgsnchina.cn/library/results/ab745207e8ed3dcfa16b4814748beead
user-agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36
"""
print("======【2】 请求中,冒号添引号,删除首冒号 ========")
maohao_str_dict = maohao2yinhao(maohao_str)
# print(maohao_str)
print()
print(maohao_str_dict)
Original code: https://www.cnblogs.com/andy9468/p/8977406.html
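The payoff of these helpers is that their output plugs straight into requests. A minimal sketch, assuming headers_dict and cookies_dict are built by the two functions above from the sample strings (cookie_str is the commented-out sample), and noting that HTTP/2 pseudo-headers and stale bookkeeping fields should be pruned before re-sending:

import requests

headers_dict = maohao2yinhao(maohao_str)   # dict built by the helper above
cookies_dict = denghao2maohao(cookie_str)  # dict built by the helper above
# Pseudo-headers and per-request fields must not be replayed as-is
for key in ('authority', 'method', 'path', 'scheme', 'content-length', 'cookie'):
    headers_dict.pop(key, None)
response = requests.get('https://www.wgsnchina.cn', headers=headers_dict, cookies=cookies_dict)
print(response.status_code)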
(2) Step 2: quickly build the request itself (a GET request with requests)
import requests

url = 'https://www.baidu.com'
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'upgrade-insecure-requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    # 'cookie': 'xxx;yyy;zzz',
    # 'referer': 'https://xxx.yyy.zzz'
}
# Send the GET request
response = requests.get(url, headers=headers, verify=True)
# Decode the HTML text
html_data = response.content.decode()
print(html_data)
print(len(html_data))
Original code: https://www.cnblogs.com/andy9468/p/11492910.html
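In practice I also add a timeout and a status check, so a bad response fails loudly instead of hanging or silently returning an error page. A hardened sketch of the same request; the 10-second timeout is an arbitrary choice:

import requests

url = 'https://www.baidu.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
try:
    response = requests.get(url, headers=headers, timeout=10)  # fail instead of hanging
    response.raise_for_status()  # raise on 4xx/5xx status codes
    response.encoding = response.apparent_encoding  # let requests guess the charset
    print(response.text[:200])
except requests.RequestException as e:
    print("request failed:", e)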
(3) Step 3: quickly parse the data (HTML, JSON, etc.) and save it to a file
1. HTML parsing
from lxml import etree

# Extract all meaningful body text
html_str = """<div>hah<a>六六六</a>cccc收拾收拾</div>"""
html_etree = etree.HTML(html_str)  # parse into an Element-typed HTML tree
all_content = html_etree.xpath('string(.)').strip()
print(all_content)
Original code: https://www.cnblogs.com/andy9468/p/10144867.html
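The string(.) call above flattens the whole tree into one text blob; ordinary XPath expressions on the same tree pull out specific nodes instead. A minimal sketch on a made-up snippet:

from lxml import etree

html_str = """<ul><li><a href="/a">first</a></li><li><a href="/b">second</a></li></ul>"""
tree = etree.HTML(html_str)
hrefs = tree.xpath('//li/a/@href')   # attribute values
texts = tree.xpath('//li/a/text()')  # text nodes
for href, text in zip(hrefs, texts):
    print(text, '->', href)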
XPath syntax:
https://www.cnblogs.com/andy9468/p/10144867.html
HTML parsing class wrapper:
https://www.cnblogs.com/andy9468/p/8060372.html
2. JSON parsing
Convert a JSON string to a dict:
dict_data = json.loads(json_data)
Original code: https://www.cnblogs.com/andy9468/p/8252897.html
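As a self-contained round trip (the sample string here is made up):

import json

json_data = '{"name": "andy", "tags": ["a", "b"]}'  # made-up sample JSON string
dict_data = json.loads(json_data)  # JSON string -> dict
print(dict_data["tags"][0])
json_str = json.dumps(dict_data, ensure_ascii=False)  # dict -> JSON string
print(json_str)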
3. Saving data to a plain text file
Writing a file with with:
save_file = "1.txt"
str_data = "123a\nbc"  # "\n" encodes the line break inside a single literal
with open(save_file, 'a', encoding="utf-8") as f:
    f.write(str_data)
Original code: https://www.cnblogs.com/andy9468/p/11493062.html
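Note that mode 'a' appends, so repeated runs keep growing the file (use 'w' to overwrite). A quick read-back sketch to verify the write, using the same file name:

with open("1.txt", encoding="utf-8") as f:
    print(f.read())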
4. Saving data to an Excel workbook
Example code for writing an Excel workbook:
from openpyxl import Workbook

def main():
    sheet_name = "表名1"
    row_count = 6  # number of rows
    info_result = []
    page = 1
    while page <= row_count:
        info = ['a', 'b', 'c']  # contents of one row
        info_result.append(info)
        page += 1
    # Write out the Excel workbook
    wb = Workbook()
    ws1 = wb.active
    ws1.title = sheet_name  # sheet name
    for row in info_result:
        ws1.append(row)
    wb.save('拉钩职位信息.xlsx')  # file name; openpyxl writes the .xlsx format, not legacy .xls

if __name__ == '__main__':
    main()
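For the reverse direction, a minimal read-back sketch with openpyxl (same file name as above; values_only needs a reasonably recent openpyxl):

from openpyxl import load_workbook

wb = load_workbook('拉钩职位信息.xlsx')
ws = wb.active
for row in ws.iter_rows(values_only=True):  # plain cell values, one tuple per row
    print(row)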