一,requests发请求
# Example: sending requests with the `requests` library.
# NOTE(review): `url` and `data` below are placeholders -- define them
# before running this snippet.
import requests

# A Session persists cookies and connection pooling across requests.
s = requests.Session()

# Query-string parameters (for GET) / form fields (for POST).
payload = {'key1': 'value1', 'key2': 'value2'}

# Route traffic through an HTTP proxy.
proxies = {
    'http': 'http://47.98.163.18:8080',
    'https': 'http://47.98.163.18:8080',
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
}

# headers          -- request headers
# data             -- POST request body
# verify           -- SSL certificate verification
# allow_redirects  -- whether to follow HTTP redirects
# proxies          -- proxy configuration
requests.get(url, headers=headers, verify=False, params=payload,
             allow_redirects=False, proxies=proxies).content.decode('utf-8')

requests.post(url, headers=headers, data=data, verify=False,
              allow_redirects=False).content.decode('utf-8')

re = requests.post(url, headers=headers, data=data, verify=False)
# Extract the response cookies as a plain dict.
requests.utils.dict_from_cookiejar(re.cookies)

# A single timeout value is used for both the connect and the read phase.
requests.get('https://github.com', timeout=2)
# A (connect, read) tuple sets each phase's timeout separately.
requests.get('https://github.com', timeout=(3.05, 27))
# timeout=None makes the request wait forever.
requests.get('https://github.com', timeout=None)
二,requests
# Example: GET request with query parameters.
import requests

kw = {'wd': '长城'}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
}

# `params` accepts a dict or a query string; a dict is URL-encoded
# automatically, so urlencode() is not needed.
response = requests.get("http://www.baidu.com/s?", params=kw, headers=headers)

# Response body as Unicode text.
print(response.text)
# Response body as raw bytes.  (Fixed typo: was `respones.content`,
# which raised NameError.)
print(response.content)
# Final resolved URL.
print(response.url)
# Character encoding of the response headers/body.
print(response.encoding)
# HTTP status code.
print(response.status_code)
三,python爬虫POST request payload形式的请求
# Example: POSTing a JSON request payload (Content-Type: application/json).
import requests
import json

# Headers announcing a JSON body to the target host.
payloadHeader = {
    'Host': 'sellercentral.amazon.com',
    'Content-Type': 'application/json',
}

# Serialize the payload dict to a JSON string and send it as the body.
requests.post(postUrl, data=json.dumps(payloadData), headers=payloadHeader)
四,字典格式存数据库,要求数据库字段和字典格式字段一样
class MogujiePipeline(object):
    """Scrapy pipeline that stores items in the MySQL table `mogujie`.

    Requires the table's column names to match the item's field names
    exactly; deduplicates on the item's `clientUrl` field.
    """

    def __init__(self):
        # Open the database connection once per spider run.
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda',
                                  user='root', password='root', charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Deduplicate: skip items whose clientUrl is already stored.
        # Values are bound with %s placeholders so the driver handles all
        # quoting -- replaces the old string-interpolated SQL, which was
        # injectable and mangled single quotes into `‘` to survive quoting.
        num = self.cursor.execute(
            'select id from mogujie where clientUrl=%s', (item["clientUrl"],))
        if not num:
            # Column names come from the item's own (trusted) field names;
            # only the values travel as bound parameters.
            keys = list(item.keys())
            values = [str(item[key]) for key in keys]
            insert_sql = 'insert into mogujie({}) values({})'.format(
                ', '.join(keys), ', '.join(['%s'] * len(keys)))
            print('insert_sql:', insert_sql)
            self.cursor.execute(insert_sql, values)
            self.db.commit()
        return item

    def close_spider(self, spider):
        # Release the cursor and connection when the spider finishes.
        self.cursor.close()
        self.db.close()
五.爬取json数据
# Example: POST for JSON data and mirror it into MySQL (skeleton).
import requests
import json
import pymysql
import logging

logging.basicConfig(
    level=logging.INFO,  # minimum level written to the log file
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line layout
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    filename='yibao.log',  # log file name
    filemode='a')  # write mode: "w" (truncate) or "a" (append)


class yibao(object):
    # Skeleton crawler: POSTs to the catalogue endpoint, parses the JSON
    # response and works against the `catalogue` table.
    # NOTE(review): every SQL template below calls .format() with NO
    # arguments, so as written each raises IndexError -- the column/value
    # fields are placeholders to be filled in before use.

    def __init__(self):
        # Database connection and cursor used by all queries below.
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda',
                                  user='root', password='root', charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        }
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        # Start crawling immediately on construction.
        self.parse_page()

    def parse_page(self):
        # Form data sent with the POST request.
        data = {
            'operationId': 'icdIds',
        }
        # POST the form and decode the response body as UTF-8.
        html = requests.post(url=self.url, headers=self.headers,
                             data=data).content.decode('utf-8')
        data_json = json.loads(html)
        for data in data_json:
            # Check whether this row already exists (placeholder template).
            num = self.cursor.execute('select id from catalogue where id={}'.format())
            if not num:
                # Insert data (placeholder template).
                self.cursor.execute(
                    'insert into catalogue() values()'.format())
                self.db.commit()
        # Query data.
        self.cursor.execute("select * from catalogue")
        data = self.cursor.fetchone()
        data = self.cursor.fetchall()
        # Update data (placeholder template).
        self.cursor.execute("update catalogue set ''='{}', ''='{}' where id={}".format())
        self.db.commit()
        # Delete data (placeholder template).
        self.cursor.execute("delete from catalogue where id={}".format())
        self.db.commit()


if __name__ == '__main__':
    yibao()
六.HTML数据
# Example: POST for an HTML page, extract fields with XPath, mirror into
# MySQL (skeleton).
import requests
import json
import time
import pymysql
import logging
import random
from lxml import etree

logging.basicConfig(
    level=logging.INFO,  # minimum level written to the log file
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line layout
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    filename='yibao.log',  # log file name
    filemode='a')  # write mode: "w" (truncate) or "a" (append)


class yibao(object):
    # Skeleton crawler: POSTs to the catalogue endpoint, parses the HTML
    # response with lxml and works against the `catalogue` table.
    # NOTE(review): every SQL template below calls .format() with NO
    # arguments, so as written each raises IndexError -- the column/value
    # fields are placeholders to be filled in before use.

    def __init__(self):
        # Database connection and cursor used by all queries below.
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda',
                                  user='root', password='root', charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        }
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        # Start crawling immediately on construction.
        self.parse_page()

    def parse_page(self):
        # Form data sent with the POST request.
        data = {
            'operationId': 'icdIds',
        }
        # POST the form and decode the response body as UTF-8.
        html = requests.post(url=self.url, headers=self.headers,
                             data=data).content.decode('utf-8')
        # Parse the HTML and pull out the anchor texts / span texts of the
        # document tree entries (two alternative locations each).
        etree_html = etree.HTML(html)
        data = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/a//text() | //div[@class="els-doc-con-left"]/a//text()')
        datas = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/span//text() | //div[@class="els-doc-con-left"]/span//text()')
        for i in range(len(data)):
            # Check whether this row already exists (placeholder template).
            num = self.cursor.execute('select id from catalogue where id={}'.format())
            if not num:
                # Insert data (placeholder template).
                self.cursor.execute(
                    'insert into catalogue() values()'.format())
                self.db.commit()
        # Query data.
        self.cursor.execute("select * from catalogue")
        data = self.cursor.fetchone()
        data = self.cursor.fetchall()
        # Update data (placeholder template).
        self.cursor.execute("update catalogue set ''='{}', ''='{}' where id={}".format())
        self.db.commit()
        # Delete data (placeholder template).
        self.cursor.execute("delete from catalogue where id={}".format())
        self.db.commit()


if __name__ == '__main__':
    yibao()
七.使用代理
# Example: routing requests through a proxy.
# (Fixed typo: `request.get` -> `requests.get`; the former raised NameError.)

# Anonymous proxy.
proxies = {
    "http": "http://ip:端口号",
    "https": "https://ip:端口号",
}
requests.get(url, proxies=proxies)

# Proxy requiring HTTP basic auth: embed the credentials in the URL.
proxies = {
    "http": "http://username:password@ip:端口号",
    "https": "https://username:password@ip:端口号",
}
requests.get(url, proxies=proxies)
附注:拼接 SQL 前可用 .replace("'", "’") 把值中的半角单引号替换为全角引号,避免引号冲突(更好的做法是使用参数化查询)。