zoukankan      html  css  js  c++  java
  • python -使用Requests库完成Post表单操作

    """
    使用Requests库完成Post表单操作
    """
    #_*_codingn:utf8 _*_
    import requests
    
    from bs4 import BeautifulSoup
    
    '''
      设置请求头,让程序发出的请求更像来源于浏览器
    '''
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
    
    if __name__ == "__main__":
    
        params ={"username": "anything","password": "password"}
    
        session =requests.session()
        post_obj = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params)
    
        s = session.get("http://pythonscraping.com/pages/cookies/profile.php")
        print(post_obj.text.encode("utf-8"))
        print(s.text.encode("utf-8"))
    
        #session.cookies.get_dict()  #获取cooking
        print(session.cookies.get_dict())
    # -*- coding: utf-8 -*-
    '''
    目标站点分析
    网页结构分析
    --开干--
    1、单页内容
    2、正则
    3、保存json
    4、多线程循环
    '''
    # .*具有贪婪的性质,首先匹配到不能匹配为止,根据后面的正则表达式,会进行回溯。
    # .*?(短)则相反,一个匹配以后,就往下进行,所以不会进行回溯,具有最小匹配的性质。
    # re.S 让.匹配换行符
    #----------------------------------
    import json
    import requests
    from requests.exceptions import RequestException
    import re
    import time
    from multiprocessing import Pool
    
headers = {  # Very important: browser-like headers for the Maoyan requests below
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Connection': 'keep-alive',
    'Referer': 'http://maoyan.com/board/6'
}
    
    def get_one_page(url):
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None  # 非200
        except RequestException:
            return None
    
    def parse_one_page(html):
        pattern = re.compile('<dd>.*?board-index.*?>(d+)</i>.*?data-src="(.*?)".*?name"><a'
                             + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                             + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
        items = re.findall(pattern, html)
        for item in items:
            yield {  # 变成生成器
                'index': item[0],
                'image': item[1],
                'title': item[2],
                'actor': item[3].strip()[3:],  # 字符串处理 (移除字符串头尾指定的字符序列)
                'time': item[4].strip()[5:],
                'score': item[5] + item[6]  # 分开匹配加起来
            }
    
    def write_to_file(content):
        with open('result.txt', 'a', encoding='utf-8') as f:  # 编码3
            f.write(json.dumps(content, ensure_ascii=False) + '
    ')  # json.dumps 序列化时对中文默认使用的ascii编码
    
    def main(offset):
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)  # return返回参数
        # print(html)
        for item in parse_one_page(html):
            # print(item)
            write_to_file(item)
    
    
    if __name__ == '__main__':
        for i in range(10):
            main(offset=i * 10)
            time.sleep(1)
        # 进程池
        # pool=Pool()
    
        # pool.map(main,[i*10 for i in range(10)])
    # coding=utf-8
    
    '''
    1、抓取索引页内容
    2、抓取详情页内容
    3、下载图片保存数据库
    4、循环及多线程
    '''
    
    import requests
    from requests.exceptions import RequestException
    from json import loads
    from bs4 import BeautifulSoup
# Minimal header set used by the Toutiao detail-page requests below.
user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
headers = {"User-Agent": user_agent}
    
    
    def get_onepage_index(i, keywords):
        data = {
            "offset": i,
            "format": "json",
            "keyword": keywords,
            "autoload": "true",
            "count": "20",
            "cur_tab": "1",
            "from": "search_tab"
        }
        url = 'https://www.toutiao.com/search_content/?'
        try:
            response = requests.get(url, params=data)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print('something is wrong!')
            return None
    
    
    def parse_onepage_index(html):
        # json.loads()用于将str类型的数据转成dict。
        data = loads(html)
        if data and 'data' in data.keys():  ##获取所有的key 值
            for item in data.get('data'):  # get() 函数返回指定键的值,如果值不在字典中返回默认值。
                yield item.get('article_url')
    
    
    def get_page_detail(url):
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                # print(response.status_code)
                return response.text
            return None
        except RequestException:
            print('wrong url:', url)
            return None
    
    
    def parsepage(html):
        soup = BeautifulSoup(html, 'lxml')
        title = soup.title.string
        print(title)
    
    
    def main():
        for i in range(1, 2):
            i = str(i * 20)
            html = get_onepage_index(i, '街拍')
            parse_onepage_index(html)
            for url in parse_onepage_index(html):
                print(url)
                detailhtml = get_page_detail(url)  # 返回网页文本
                # print(detailhtml)
                if detailhtml == None:
                    pass
                else:
                    parsepage(detailhtml)  # bs4去解析
    
    
    # get_page_detail('http://toutiao.com/group/6596305324645286404/')
    
    if __name__ == '__main__':
          main()

    如有疑问,请留言。

    如觉得有帮助,请点个赞,谢谢!

  • 相关阅读:
    机器学习是什么
    Computer Vision的尴尬---by林达华
    机器学习算法与Python实践之(四)支持向量机(SVM)实现
    机器学习算法与Python实践之(三)支持向量机(SVM)进阶
    Hortonworks HDP Sandbox定制(配置)开机启动服务(组件)
    GCC单独编译host/examples/ tx_waveforms.cpp
    GDAL1.11版本号对SHP文件索引加速測试
    Tcl 简单介绍及特性
    Hardwood Species
    java整合easyui进行的增删改操作
  • 原文地址:https://www.cnblogs.com/chendezhen/p/9909804.html
Copyright © 2011-2022 走看看