  • Web scraper project roundup

    Fetching data without logging in

        # coding=utf-8
        """
        Wrap the crawl in a class.
        Goal: fetch all of the movies for a given Douban region, ranked by popularity.
        Approach:
            analyse the target URL in Chrome dev tools,
            build the URL,
            send the request and get the data,
            save the data,
            repeat the last three steps until the final page.
        Note: the URLs used in this code are no longer valid.
        """
        import requests
        import json


        class DoubanSpider:
            def __init__(self):
                self.url_temp_list = [
                    {
                        "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                        "country": "US"
                    },
                    {
                        "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
                        "country": "UK"
                    },
                    {
                        "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                        "country": "CN"
                    }
                ]
                self.headers = {
                    "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
                    "Referer": "https://m.douban.com/movie/"
                }

            def parse_url(self, url):  # send the request, get the response
                print(url)
                response = requests.get(url, headers=self.headers)
                return response.content.decode()

            def get_content_list(self, json_str):  # extract the data
                dict_ret = json.loads(json_str)
                content_list = dict_ret["subject_collection_items"]
                total = dict_ret["total"]  # total number of items; not necessarily accurate
                return content_list, total

            def save_content_list(self, content_list, country):  # save
                with open("douban.txt", "a", encoding="utf-8") as f:
                    for content in content_list:
                        content["country"] = country
                        f.write(json.dumps(content, ensure_ascii=False))
                        f.write("\n")  # newline so each record sits on its own line
                print("saved")

            def run(self):  # main logic
                for url_temp in self.url_temp_list:
                    num = 0  # num is the "start" parameter in the URL, i.e. the page offset
                    total = 100  # assume there is at least a first page
                    while num < total + 18:  # strictly less than: "total" may be inaccurate, and equality would mean the previous request already covered the last page
                        # 1. build this page's URL
                        url = url_temp["url_temp"].format(num)
                        # 2. send the request, get the response
                        json_str = self.parse_url(url)
                        # 3. extract the data
                        content_list, total = self.get_content_list(json_str)

                        # 4. save every page as it is fetched rather than saving everything at the end,
                        #    so a failure halfway through does not throw away what was already collected
                        self.save_content_list(content_list, url_temp["country"])
                        # if len(content_list) < 18:  # checking for the last page this way also works
                        #     break
                        # 5. move "start" to the next page and loop
                        num += 18


        if __name__ == '__main__':
            douban_spider = DoubanSpider()
            douban_spider.run()
    01. Douban: fetch the hottest movie info
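
    save_content_list writes one JSON object per line (JSON Lines). A minimal sketch, assuming a douban.txt produced by the spider above, for loading the records back:

        import json

        records = []
        with open("douban.txt", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:  # skip blank lines
                    records.append(json.loads(line))

        print(len(records), "records loaded")
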
        # coding=utf-8
        import requests
        from retrying import retry

        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
        }


        @retry(stop_max_attempt_number=3)  # retry up to 3 times before giving up
        def _parse_url(url, method, data, proxies):
            print("*" * 20)
            if method == "POST":
                response = requests.post(url, data=data, headers=headers, proxies=proxies)
            else:
                response = requests.get(url, headers=headers, timeout=3, proxies=proxies)
            assert response.status_code == 200
            return response.content.decode()


        def parse_url(url, method="GET", data=None, proxies={}):
            try:
                html_str = _parse_url(url, method, data, proxies)
            except Exception:
                html_str = None

            return html_str


        if __name__ == '__main__':
            url = "https://www.baidu.com"  # the scheme is required, otherwise requests raises MissingSchema
            print(parse_url(url))
    02-1. Generic Tieba spider - parse_url.py
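
    The proxies argument above uses the standard requests proxy mapping (scheme to proxy URL). A minimal usage sketch; the proxy address is a placeholder, not a working proxy:

        from parse_url import parse_url

        # scheme -> proxy URL, the mapping format requests expects
        proxies = {
            "http": "http://127.0.0.1:8888",   # placeholder address
            "https": "http://127.0.0.1:8888",  # placeholder address
        }
        html = parse_url("https://www.baidu.com", proxies=proxies)
        print(html is not None)  # False if all three attempts failed
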
        from parse_url import parse_url
        from lxml import etree
        import json
        """
        Crawl any Tieba forum: grab the title and comment count of each post in the
        post list, plus the images on every page of each post's detail thread, and
        save everything to a file.
        Crawled: 2019/3
        """


        class TieBa:
            def __init__(self, name):
                self.name = name
                self.start_url = f"https://tieba.baidu.com/f?kw={name}&ie=utf-8&pn=0"
                self.root_url = "https://tieba.baidu.com"

            def etree_get_content(self, text):
                html = etree.HTML(text)
                li_list = html.xpath("//li[@class=' j_thread_list clearfix']")  # group the post nodes
                data = []
                for i in li_list:
                    # print(etree.tostring(i).decode())
                    item = {}
                    item["title"] = i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/text()")[0] if i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/text()") else None
                    item["comments"] = i.xpath(".//span[@class='threadlist_rep_num center_text']/text()")[0] if i.xpath(".//span[@class='threadlist_rep_num center_text']/text()") else None
                    item["href"] = self.root_url + i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/@href")[0] if i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/@href") else None
                    item["imgs"] = self.get_img_list(item["href"], [])
                    data.append(item)
                # '下一页>' is the literal text of the "next page" link on the list page
                next_url = html.xpath(".//a[text()='下一页>']/@href")[0] if html.xpath(".//a[text()='下一页>']/@href") else None
                if next_url is not None:
                    next_url = "https:" + next_url  # the href is protocol-relative (//tieba.baidu.com/...)
                return data, next_url

            def pre_html(self, text):
                """
                Strip the HTML comment markers Tieba wraps the post list in,
                so lxml can see the content.
                """
                text = text.replace('<!--', '')
                return text.replace('-->', '')

            def get_img_list(self, next_url, container):
                """Recursively crawl the images posted on every page of a thread's detail view."""
                if next_url is None:
                    return container
                detail_content = parse_url(next_url)
                # extract this page's images, then look for the next page
                html = etree.HTML(detail_content)
                img_list = html.xpath("//img[@class='BDE_Image']/@src")  # list of image src attributes
                container.extend(img_list)
                next_url = html.xpath(".//a[text()='下一页']/@href")  # href of the "next page" link
                if next_url:  # non-empty list
                    # The return is optional here: container is a mutable list, so the final
                    # "return container" below would also work, but returning directly saves
                    # one step in every frame except the last.
                    return self.get_img_list(self.root_url + next_url[0], container)
                return container

            def save_content_dict(self, data):
                file_path = self.name + ".txt"
                with open(file_path, 'a+', encoding='utf8') as f:
                    for dd in data:
                        f.write(json.dumps(dd, ensure_ascii=False))
                        f.write('\n')

            def run(self):
                # first page
                next_url = self.start_url
                # loop over every page
                while next_url is not None:
                    html_str = parse_url(next_url)
                    # preprocess the response
                    html_str = self.pre_html(html_str)
                    # extract this page's content and the next page's URL
                    data, next_url = self.etree_get_content(html_str)
                    print(data)
                    self.save_content_dict(data)


        if __name__ == '__main__':
            name = input("Enter the Tieba forum name: ").strip()
            ba = TieBa(name)
            ba.run()
    02-2. Generic Tieba spider - main.py
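
    get_img_list above recurses once per detail page, which is fine for short threads but would hit Python's default recursion limit (roughly 1000 frames) on an extremely long one. A sketch of an equivalent iterative version, assuming the same parse_url helper and the same page structure:

        from lxml import etree
        from parse_url import parse_url


        def get_img_list_iter(start_url, root_url="https://tieba.baidu.com"):
            """Iterative variant: walk the detail pages with a while loop instead of recursion."""
            imgs = []
            next_url = start_url
            while next_url is not None:
                html_str = parse_url(next_url)
                if html_str is None:  # request failed even after retries
                    break
                html = etree.HTML(html_str)
                imgs.extend(html.xpath("//img[@class='BDE_Image']/@src"))
                nxt = html.xpath(".//a[text()='下一页']/@href")  # literal "next page" link text
                next_url = root_url + nxt[0] if nxt else None
            return imgs
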
     1 """
     2 抓取https://www.qiushibaike.com所有热门 中数据
     3 爬取时间:2019/4
     4 """
     5 from parse_url import parse_url
     6 from lxml import etree
     7 import json
     8 
     9 
    10 class QiuShi:
    11 
    12     def __init__(self):
    13         self.start_url = "https://www.qiushibaike.com/8hr/page/{}/"  # 根据规律构建全部url地址
    14         self.part_url = "https://www.qiushibaike.com"
    15 
    16     def etree_get_content(self, text):
    17         etree_elemnt = etree.HTML(text)
    18         # 先分组
    19         content_list = etree_elemnt.xpath("//div[@class='recommend-article']/ul/li")
    20         data = []
    21         for li in content_list:
    22             item = {}
    23             try:
    24                 item['title'] = li.xpath(".//a[@class='recmd-content']/text()")[0] if li.xpath(".//a[@class='recmd-content']/text()") else None
    25                 item['href'] = self.part_url + li.xpath(".//a[@class='recmd-content']/@href")[0] if li.xpath(".//a[@class='recmd-content']/@href") else None
    26                 item['laugh_num'] = li.xpath(".//div[@class='recmd-num']/span[1]/text()")[0] if li.xpath(".//div[@class='recmd-num']/span[4]/text()") else None
    27                 item['comment_num'] = li.xpath(".//div[@class='recmd-num']/span[4]/text()")[0] if li.xpath(".//div[@class='recmd-num']/span[4]/text()") else None
    28             except Exception as e:
    29                 print(e)
    30                 continue
    31             data.append(item)
    32         return data
    33 
    34     def save_content_dict(self, data):
    35         file_path = "糗事百科热门.txt"
    36         with open(file_path, 'a+', encoding='utf8') as f:
    37             for dd in data:
    38                 f.write(json.dumps(dd, ensure_ascii=False))
    39                 f.write('
    ')
    40 
    41     def run(self):
    42         # 构建url地址列表
    43         for i in range(1, 14):
    44             # 获取每一页目标响应
    45             html_str = parse_url(self.start_url.format(i))
    46             # 解析页面
    47             data = self.etree_get_content(html_str)
    48             # 每一页保存一次
    49             self.save_content_dict(data)
    50 
    51 
    52 if __name__ == '__main__':
    53     q = QiuShi()
    54     q.run()
    03. Scraping qiushibaike - single-threaded
     1 """
     2 多线程 抓取https://www.qiushibaike.com所有热门 中数据,
     3 但是该网站布局已经改版了,部分xpath解析的位置已经不在了
     4 爬取时间:2017/10
     5 """
     6 import requests
     7 from lxml import etree
     8 import threading
     9 from queue import Queue
    10 
    11 
    12 class QiubaiSpdier:
    13     def __init__(self):
    14         self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
    15         self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
    16         self.url_queue = Queue()
    17         self.html_queue  = Queue()
    18         self.content_queue = Queue()
    19 
    20     def get_url_list(self):
    21         # return [self.url_temp.format(i) for i in range(1,14)]
    22         for i in range(1,4):
    23             self.url_queue.put(self.url_temp.format(i))
    24 
    25     def parse_url(self):
    26         while True:
    27             url = self.url_queue.get()
    28             print(url)
    29             response = requests.get(url,headers=self.headers)
    30             self.html_queue.put(response.content.decode())
    31             self.url_queue.task_done() # 注意必须url的get并处理好url的响应put到对应的队列后,再调用url的task_done使计数减一
    32 
    33     def get_content_list(self): # 提取数据
    34         while True:
    35             html_str = self.html_queue.get()
    36 
    37             html = etree.HTML(html_str)
    38             div_list = html.xpath("//div[@id='content-left']/div")  #分组
    39             content_list = []
    40             for div in div_list:
    41                 item= {}
    42                 item["content"] = div.xpath(".//div[@class='content']/span/text()")
    43                 item["content"] = [i.replace("
    ","") for i in item["content"]]
    44                 item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
    45                 item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None
    46                 item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
    47                 item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"])>0 else None
    48                 item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
    49                 item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None
    50                 item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
    51                 item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
    52                 content_list.append(item)
    53             self.content_queue.put(content_list)
    54             self.html_queue.task_done() # 注意task_done放在put后面,确保get的结果处理完并且已经put都对应的队列中
    55 
    56     def save_content_list(self): # 保存
    57         while True:
    58             content_list = self.content_queue.get()
    59             for i in content_list:
    60                 # print(i)
    61                 pass
    62             self.content_queue.task_done()
    63 
    64     def run(self): #实现主要逻辑
    65         thread_list = []
    66         #1.url_list
    67         t_url = threading.Thread(target=self.get_url_list)
    68         thread_list.append(t_url)
    69         #2.遍历,发送请求,获取响应
    70         for i in range(20):
    71             t_parse = threading.Thread(target=self.parse_url)
    72             thread_list.append(t_parse)
    73         #3.提取数据
    74         for i in range(2):
    75             t_html = threading.Thread(target=self.get_content_list)
    76             thread_list.append(t_html)
    77         #4.保存
    78         t_save = threading.Thread(target=self.save_content_list)
    79         thread_list.append(t_save)
    80         for t in thread_list:
    81             t.setDaemon(True) # 把子线程设置为守护线程,主线程结束,子线程结束
    82             t.start()
    83 
    84         for q in [self.url_queue,self.html_queue,self.content_queue]:
    85             # 调用此方法让主线程阻塞,直到队列中所有的项目均被处理。阻塞将持续到队列中的每个项目均调用q.task_done()方法为止
    86             q.join()
    87 
    88 if __name__ == '__main__':
    89     qiubai = QiubaiSpdier()
    90     qiubai.run()
    91     print("主线程结束")
    03. Scraping qiushibaike - multithreaded
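
    The shutdown logic above hinges on how daemon threads, Queue.task_done() and Queue.join() interact: join() only returns once task_done() has been called for every item that was put(), which is why each worker calls task_done() only after it has pushed its result onto the next queue. A stripped-down sketch of the same pattern with a trivial work function:

        import threading
        from queue import Queue

        in_q, out_q = Queue(), Queue()


        def worker():
            while True:
                n = in_q.get()
                out_q.put(n * n)  # hand the result to the next stage first...
                in_q.task_done()  # ...then mark this item as done


        def consumer():
            while True:
                print(out_q.get())
                out_q.task_done()


        for target in (worker, consumer):
            t = threading.Thread(target=target)
            t.daemon = True  # daemon threads are killed when the main thread exits
            t.start()

        for n in range(5):
            in_q.put(n)

        for q in (in_q, out_q):
            q.join()  # blocks until task_done() has been called for every item put on q
        print("all items processed")
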
     1 """
     2 第一页:
     3 Request URL: https://www.douyu.com/directory/all
     4 Request Method: GET
     5 
     6 
     7 第二页:
     8 Request URL: https://www.douyu.com/gapi/rkc/directory/0_0/2
     9 Request Method: GET
    10 
    11 爬取时间:2019/4
    12 """
    13 import json
    14 import time
    15 from retrying import retry
    16 from selenium import webdriver
    17 from selenium.webdriver.chrome.options import Options
    18 
    19 chrome_options = Options()
    20 chrome_options.add_argument('--headless')  # 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
    21 
    22 class DouYuSpider:
    23     def __init__(self):
    24         self.start_url = 'https://www.douyu.com/directory/all'
    25         self.driver = webdriver.Chrome(chrome_options=chrome_options)
    26 
    27     def save_content_dict(self, data):
    28         file_path = 'douyu-room' + ".txt"
    29         with open(file_path, 'a+', encoding='utf8') as f:
    30             for dd in data:
    31                 f.write(json.dumps(dd, ensure_ascii=False))
    32                 f.write('
    ')
    33             f.flush()
    34 
    35     @retry(stop_max_attempt_number=3)
    36     def get_next_page_click(self):
    37         next_page = self.driver.find_elements_by_xpath("//li[@class=' dy-Pagination-next']/span")
    38         # 最后一页的 下一页 父元素 class=‘dy-Pagination-disabled dy-Pagination-next’ 表示不可点击了
    39         if len(next_page) == 0:
    40             return -1
    41         else:
    42             next_page[0].click()
    43 
    44     def get_single_page(self):
    45         # 先分组
    46         room_list = self.driver.find_elements_by_xpath(
    47             "//div[@class='layout-Module-container layout-Cover ListContent']/ul/li")
    48         data = []
    49         for room in room_list:
    50             item = {}
    51             item['title'] = room.find_element_by_xpath(".//h3[@class='DyListCover-intro']").text
    52             item['zone'] = room.find_element_by_xpath(".//span[@class='DyListCover-zone']").text
    53             # item['img'] = room.find_element_by_xpath(".//img[@class='DyImg-content is-normal']").get_attribute(
    54             #     'src')
    55             item['anchor_name'] = room.find_element_by_xpath(".//h2[@class='DyListCover-user']").text
    56             data.append(item)
    57         return data
    58 
    59 
    60     def run(self):
    61         # 第一页
    62         self.driver.get(self.start_url)
    63         self.driver.implicitly_wait(12)
    64 
    65         while True:
    66             # 获取每一页的页面结构化数据
    67             data = self.get_single_page()
    68             # 保存数据
    69             self.save_content_dict(data)
    70             # 查找下一页url,并点击
    71             try:
    72                 ret = self.get_next_page_click()
    73                 time.sleep(2) # 等待页面加载完全
    74                 if ret == -1:
    75                     break
    76             except Exception as e:
    77                 print(e)
    78 
    79         self.driver.quit()
    80 
    81 
    82 if __name__ == '__main__':
    83     douyu = DouYuSpider()
    84     douyu.run()
    85 
    86 """
    87 优化建议:
    88 1.把每一页的self.driver.page_source 页面字符串传给lxml的etree去处理
    89 2.staleness_of 尝试失败 https://www.mlln.cn/2018/05/22/python-selenium如何在点击后等待页面刷新
    90 
    91 """
    04. Scraping Douyu live-room info
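
    A sketch of the first improvement above: grab driver.page_source once per page and let lxml do the extraction, which avoids one Selenium round trip per field. The XPaths are taken from the listing and may no longer match the current Douyu markup:

        from lxml import etree


        def parse_room_page(page_source):
            """Extract room info from one rendered page with lxml instead of per-element Selenium lookups."""
            html = etree.HTML(page_source)
            room_list = html.xpath("//div[@class='layout-Module-container layout-Cover ListContent']/ul/li")
            data = []
            for room in room_list:
                data.append({
                    "title": "".join(room.xpath(".//h3[@class='DyListCover-intro']/text()")),
                    "zone": "".join(room.xpath(".//span[@class='DyListCover-zone']/text()")),
                    "anchor_name": "".join(room.xpath(".//h2[@class='DyListCover-user']/text()")),
                })
            return data

        # inside DouYuSpider.run() this would replace the get_single_page() call:
        # data = parse_room_page(self.driver.page_source)
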

    Auto-login examples

     1 """
     2 套路:登录首页的时候,已经给浏览器设置cookies,此时未激活
     3 登录成功后返回假的cookies,激活未激活的cookies,
     4 
     5 """
     6 import requests
     7 from bs4 import BeautifulSoup
     8 
     9 headers = {
    10     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    11 }
    12 
    13 index = requests.get("https://dig.chouti.com/", headers=headers)
    14 cookies = index.cookies.get_dict()
    15 
    16 
    17 # ===========================点赞=================
    18 
    19 # 1.登录
    20 login = requests.post(
    21     "https://dig.chouti.com/login",
    22     data={
    23         "phone": 8615026809593,
    24         "password":'dajiahaa',
    25     },
    26     headers=headers,
    27     cookies=cookies)
    28 
    29 # 2.点赞
    30 dizan = requests.post(
    31     url="https://dig.chouti.com/link/vote?linksId=25389911",
    32     cookies=cookies,
    33     headers=headers)
    34 
    35 print(dizan.text)
    01. Chouti (dig.chouti.com)
     1 """
     2 套路:
     3 - 带请求头
     4 - 带cookie
     5 - 请求体中:
     6     commit:Sign in
     7     utf8:✓
     8     authenticity_token:放在页面隐藏表单中
     9     login:asdfasdfasdf
    10     password:woshiniba8
    11 
    12 """
    13 import requests
    14 from bs4 import BeautifulSoup
    15 
    16 headers = {
    17     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    18 }
    19 
    20 login = requests.get(
    21     "https://github.com/login",
    22     headers=headers,
    23 )
    24 cookies = login.cookies.get_dict()
    25 login_par = BeautifulSoup(login.content, 'html.parser')
    26 token_input = login_par.find(name='input', attrs={"name": "authenticity_token"})
    27 
    28 authenticity_token = token_input.attrs.get("value")
    29 # 1.登录
    30 re_login = requests.post(
    31     "https://github.com/session",
    32     data={
    33         "commit": "Sign in",
    34         "utf8":"",
    35         "login": "cpcp@163.com",
    36         "password": 'cs11187',
    37         "authenticity_token": authenticity_token,
    38         "webauthn-support": "supported"
    39     },
    40     cookies=cookies,
    41     headers={
    42         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    43         "Referer": "https://github.com/login"
    44     }
    45 )
    46 
    47 print(re_login.text)
    02. GitHub
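
    Both login examples pass the cookie dict around by hand. A sketch of the same GitHub flow using requests.Session(), which stores the cookies from each response and attaches them to later requests automatically; the credentials are placeholders:

        import requests
        from bs4 import BeautifulSoup

        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        }

        with requests.Session() as s:
            s.headers.update(headers)
            # GET the login page; the session keeps whatever cookies it sets
            login_page = s.get("https://github.com/login")
            soup = BeautifulSoup(login_page.content, "html.parser")
            token = soup.find(name="input", attrs={"name": "authenticity_token"}).attrs.get("value")

            # POST the form; the cookies from the previous response are attached automatically
            resp = s.post(
                "https://github.com/session",
                data={
                    "commit": "Sign in",
                    "utf8": "✓",
                    "login": "your-username",     # placeholder
                    "password": "your-password",  # placeholder
                    "authenticity_token": token,
                    "webauthn-support": "supported",
                },
                headers={"Referer": "https://github.com/login"},
            )
            print(resp.status_code)
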
  • Original post: https://www.cnblogs.com/carlous/p/10624842.html