requests

    1. Sending requests with requests

    import requests

    s = requests.Session()  # a Session reuses cookies and connections across requests
    payload = {'key1': 'value1', 'key2': 'value2'}
    proxies = {'http': 'http://47.98.163.18:8080', 'https': 'http://47.98.163.18:8080'}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    # url and data are placeholders for the target address and the POST body
    requests.get(url, headers=headers, verify=False, params=payload, allow_redirects=False, proxies=proxies).content.decode('utf-8')
    # headers          request headers
    # data             POST body
    # verify           SSL certificate verification
    # allow_redirects  whether to follow redirects
    # proxies          proxy settings
    requests.post(url, headers=headers, data=data, verify=False, allow_redirects=False).content.decode('utf-8')
    resp = requests.post(url, headers=headers, data=data, verify=False)
    # extract the cookies as a plain dict
    requests.utils.dict_from_cookiejar(resp.cookies)

    requests.get('https://github.com', timeout=2)   # a single timeout value applies to both connect and read
    requests.get('https://github.com', timeout=(3.05, 27))   # pass a (connect, read) tuple to set them separately
    requests.get('https://github.com', timeout=None)   # pass None to make the request wait forever
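
    Since the Session created above is never actually used, here is a minimal sketch of what a Session buys you: headers set once apply to every request, and cookies from one response are sent automatically on the next (the endpoints and form fields are hypothetical):

    s = requests.Session()
    s.headers.update(headers)                                 # defaults merged into every request
    s.post('https://example.com/login', data={'user': 'u'})   # hypothetical login endpoint
    r = s.get('https://example.com/profile')                  # cookies from the login are reused here
    print(s.cookies.get_dict())                               # the session-wide cookie jar as a dict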

    2. A complete requests GET example

    import requests
    
    kw = {'wd':'长城'}
    
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
     
    
    
    # params accepts a dict or a string of query parameters; a dict is
    # URL-encoded automatically, so there is no need to call urlencode()
    response = requests.get("http://www.baidu.com/s?", params=kw, headers=headers)
    
    # response.text returns the body decoded to str (Unicode)
    print(response.text)
    
    # response.content returns the raw bytes of the body
    print(response.content)
    
    # the full, final URL of the request
    print(response.url)
    
    # the encoding requests guessed from the response headers
    print(response.encoding)
    
    # the HTTP status code
    print(response.status_code)
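
    To see the automatic URL encoding without sending anything, you can build a prepared request; a minimal sketch (the printed URL is what requests would put on the wire, with the query value percent-encoded as UTF-8):

    req = requests.Request('GET', 'http://www.baidu.com/s', params={'wd': '长城'}).prepare()
    print(req.url)  # http://www.baidu.com/s?wd=%E9%95%BF%E5%9F%8E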

    3. POSTing a JSON request payload in a Python crawler

    import requests
    import json
    
    # postUrl and payloadData are placeholders for the target URL and the dict to send;
    # json.dumps() serializes the dict so the body goes out as raw JSON rather than form data
    payloadHeader = {
        'Host': 'sellercentral.amazon.com',
        'Content-Type': 'application/json',
    }
    requests.post(postUrl, data=json.dumps(payloadData), headers=payloadHeader)
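
    requests can also serialize the payload itself: passing json= sets the Content-Type: application/json header automatically. A minimal equivalent sketch:

    requests.post(postUrl, json=payloadData, headers={'Host': 'sellercentral.amazon.com'})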
    

    4. Saving a dict to the database (the table's columns must match the dict's keys)

    import pymysql
    
    
    class MogujiePipeline(object):
        def __init__(self):
            # open the database connection
            self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                      charset='utf8')
            # self.db = pymysql.connect(host='rm-bp195i4u0w1066u709o.mysql.rds.aliyuncs.com', port=3306, database='spider58',
            #                           user='spider58',
            #                           password='58spider@123',
            #                           charset='utf8')
            self.cursor = self.db.cursor()
    
        def process_item(self, item, spider):
            # check whether this clientUrl has already been scraped into the table
            print('select id from mogujie where clientUrl="{}"'.format(item["clientUrl"]))
            num = self.cursor.execute('select id from mogujie where clientUrl="{}"'.format(item["clientUrl"]))
            if not num:
                list_keys = []
                list_values = []
                for key, value in item.items():
                    list_keys.append(key)
                    # swap ASCII single quotes for full-width ones so they cannot break the SQL literal
                    list_values.append("'" + str(value).replace("'", "‘") + "'")
                # build the insert statement from the item's keys and values
                insert_sql = 'insert into mogujie({}) values({})'.format(', '.join(list_keys),
                                                                         ', '.join(list_values))
                print('insert_sql:', insert_sql)
                self.cursor.execute(insert_sql)
                self.db.commit()
    
            return item
    
        def close_spider(self, spider):
            # close the cursor and the database connection
            self.cursor.close()
            self.db.close()
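
    Hand-quoting values like this is fragile; pymysql can escape values itself through parameterized queries. A minimal sketch of the same insert written that way (assuming, as above, that the item keys match the column names):

    columns = ', '.join(item.keys())
    placeholders = ', '.join(['%s'] * len(item))
    insert_sql = 'insert into mogujie({}) values({})'.format(columns, placeholders)
    self.cursor.execute(insert_sql, list(item.values()))  # each value is escaped by the driver
    self.db.commit()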
    

    5. Scraping JSON data

    import requests
    import json
    import pymysql
    import logging
    
    logging.basicConfig(
        level=logging.INFO,  # minimum level written to the file; everything at or above it is recorded
        format='%(asctime)s  %(filename)s  %(levelname)s : %(message)s',  # log line format
        datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
        filename='yibao.log',  # log file name
        filemode='a')  # write mode: 'w' to overwrite, 'a' to append
    
    
    class yibao(object):
        def __init__(self):
            self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                      charset='utf8')
            self.cursor = self.db.cursor()
            self.headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36", }
            self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
    
            self.parse_page()
    
        def parse_page(self):
            data = {
                'operationId': 'icdIds',
            }
            html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
            data_json = json.loads(html)
            for row in data_json:
                # 'id' and 'name' below are placeholder names; substitute the real keys
                # of the JSON rows and the real columns of the catalogue table
                num = self.cursor.execute('select id from catalogue where id={}'.format(row['id']))
                if not num:
                    # insert the row
                    self.cursor.execute(
                        "insert into catalogue(id, name) values({}, '{}')".format(row['id'], row['name']))
                    self.db.commit()
    
                    # query rows
                    self.cursor.execute("select * from catalogue")
                    first_row = self.cursor.fetchone()    # first row of the result set
                    other_rows = self.cursor.fetchall()   # all remaining rows
    
                    # update a row
                    self.cursor.execute("update catalogue set name='{}' where id={}".format(row['name'], row['id']))
                    self.db.commit()
    
                    # delete a row
                    self.cursor.execute("delete from catalogue where id={}".format(row['id']))
                    self.db.commit()
    
    
    if __name__ == '__main__':
        yibao()
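
    As an aside, requests can decode a JSON body directly, so the decode()/json.loads() pair above can be collapsed into one call:

    data_json = requests.post(url=self.url, headers=self.headers, data=data).json()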
    

    6. Scraping HTML data

    import requests
    import pymysql
    import logging
    from lxml import etree
    
    logging.basicConfig(
        level=logging.INFO,  # minimum level written to the file; everything at or above it is recorded
        format='%(asctime)s  %(filename)s  %(levelname)s : %(message)s',  # log line format
        datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
        filename='yibao.log',  # log file name
        filemode='a')  # write mode: 'w' to overwrite, 'a' to append
    
    
    class yibao(object):
        def __init__(self):
            self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                      charset='utf8')
            self.cursor = self.db.cursor()
            self.headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36", }
            self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
    
            self.parse_page()
    
        def parse_page(self):
            data = {
                'operationId': 'icdIds',
            }
            html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
            etree_html = etree.HTML(html)
            # the two XPath queries return parallel lists: names[i] and codes[i] describe the same entry
            names = etree_html.xpath(
                '//*[@id="classicont"]/div[@class="els-doc-h4"]/a//text() | //div[@class="els-doc-con-left"]/a//text()')
            codes = etree_html.xpath(
                '//*[@id="classicont"]/div[@class="els-doc-h4"]/span//text() | //div[@class="els-doc-con-left"]/span//text()')
            for i in range(len(names)):
                # 'id' and 'name' below are placeholder column names for the catalogue table
                num = self.cursor.execute("select id from catalogue where id='{}'".format(codes[i]))
                if not num:
                    # insert the entry
                    self.cursor.execute(
                        "insert into catalogue(id, name) values('{}', '{}')".format(codes[i], names[i]))
                    self.db.commit()
    
                    # query entries
                    self.cursor.execute("select * from catalogue")
                    first_row = self.cursor.fetchone()    # first row of the result set
                    other_rows = self.cursor.fetchall()   # all remaining rows
    
                    # update an entry
                    self.cursor.execute("update catalogue set name='{}' where id='{}'".format(names[i], codes[i]))
                    self.db.commit()
    
                    # delete an entry
                    self.cursor.execute("delete from catalogue where id='{}'".format(codes[i]))
                    self.db.commit()
    
    
    if __name__ == '__main__':
        yibao()
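
    To see what those XPath expressions select without hitting the live site, here is a self-contained sketch against a tiny inline fragment shaped like the real page (the text values are made up):

    from lxml import etree
    
    fragment = '<div id="classicont"><div class="els-doc-h4"><a>Appendectomy</a><span>47.0</span></div></div>'
    doc = etree.HTML(fragment)
    print(doc.xpath('//*[@id="classicont"]/div[@class="els-doc-h4"]/a//text()'))     # ['Appendectomy']
    print(doc.xpath('//*[@id="classicont"]/div[@class="els-doc-h4"]/span//text()'))  # ['47.0']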
    

    7. Using proxies

    proxies = {
        "http": "http://ip:port",
        "https": "https://ip:port",
    }
    requests.get(url, proxies=proxies)
    
    # a proxy that requires authentication takes the credentials in the URL
    proxies = {
        "http": "http://username:password@ip:port",
        "https": "https://username:password@ip:port",
    }
    requests.get(url, proxies=proxies)
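
    Proxies can also be attached to a Session so that every request made through it is proxied; a minimal sketch:

    s = requests.Session()
    s.proxies.update({
        "http": "http://ip:port",
        "https": "https://ip:port",
    })
    s.get(url)  # goes through the proxy without passing proxies= each time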
    
