  • [Python Learning] Web scraper source code

    1. Standing on the shoulders of giants: this was put together from resources found online.

    2. It uses a few common packages, such as requests and re.

    3. Note: use os.makedirs to create nested directories and os.mkdir to create a single-level directory (see the sketch after this list).
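
    A minimal sketch of the packages from item 2 and the two folder-creation calls from item 3; example.com, gallery_01, and downloads/2021/gallery_01 are hypothetical names chosen only for illustration:

      import os
      import re
      import requests

      # requests fetches the page; re pulls data out of the raw HTML
      html = requests.get("https://example.com", timeout=10).text
      titles = re.findall(r"<h1>(.*?)</h1>", html, re.S)
      print(titles)

      # os.mkdir creates exactly one level and fails if the parent folder is missing
      if not os.path.exists("gallery_01"):
          os.mkdir("gallery_01")

      # os.makedirs creates every missing level of a nested path;
      # exist_ok=True keeps it from raising if the folders already exist
      os.makedirs("downloads/2021/gallery_01", exist_ok=True)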

      # Import the required packages
      # requests: fetch web pages
      import requests
      # re: parse pages with regular expressions
      import re
      # time: pause between requests
      import time
      # os: create folders for the downloaded images
      import os

      # Request headers; the User-Agent tells the server who we are
      headers = {
          "user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'
      }

      ############################################  Program entry  ############################################
      # Number of index pages to poll
      polling_page_num = int(input("Enter the number of index pages to poll: "))

      ############################################  Functions  ############################################

      # Download the HTML source of a page
      def download_page(url, html_encode='utf-8', *args, **kwargs):
          """
          Download the HTML source of a page.
          :param url: link of the page whose HTML is needed
          :param html_encode: encoding used to decode the page, defaults to "utf-8"
          :param args:
          :param kwargs:
          :return: the page's HTML source and the HTTP status code
          """
          headers = {
              "user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'}
          r = requests.get(url, headers=headers)
          r.encoding = html_encode
          response_status = r.status_code
          return r.text, response_status


      # Parse the index page and return the processed list of gallery links
      def anasy_main_page(html, *args, **kwargs):
          ex = '<a href="(.*?)" title=".*?"><img alt=".*?" src=".*?"><i>.*?</i></a>'
          # ex = '<a href="(.*?)" title=.*?><img alt=.*? src=.*?><i>.*?</i></a>'
          test_src_list = re.findall(ex, html, re.S)
          new_src_list = test_src_list[1:31]
          li_piclink = []
          for pic_link in new_src_list:
              # '链接' ("link") appears to be a placeholder the author left for the site's base URL
              new_pic_link = '链接' + pic_link
              li_piclink.append(new_pic_link)
          return li_piclink


      # Parse a gallery page and return the image's download address
      def anasy_Secondary_page(Secondary_html):
          """
          :param Secondary_html: HTML source of the gallery page
          :return: a tuple of
                  dir_name -- folder name
                  pic_link -- image link
          """
          ex_link = '<img  alt=".*?" src="(.*?)"  />'
          ex_name = '<h1>(.*?)</h1>'  # adjust this pattern for the target site
          pic_link = re.findall(ex_link, Secondary_html, re.S)[0]
          dir_name = re.findall(ex_name, Secondary_html, re.S)[0]
          return dir_name, pic_link


      # Create the folder for a gallery
      def create_folder(dir_name):
          dir_name_cl = "".join(dir_name.split())  # strip whitespace from the folder name
          dir_name = dir_name_cl
          if not os.path.exists(dir_name):
              os.mkdir(dir_name)  # single level only; nested paths would need os.makedirs
          return dir_name


      # Download a single image
      def down_pic(dir_name, pic_link):
          """
          :param dir_name: target folder
          :param pic_link: direct link to the image
          :return:
          """
          img_data = requests.get(url=pic_link, headers=headers).content
          img_name = pic_link.split('/')[-1]
          imgPath = dir_name + '/' + img_name
          with open(imgPath, 'wb') as f:
              f.write(img_data)
          return


      # Generate the list of index-page URLs
      def create_main_url(url_num):
          url_ys = '子链接'  # '子链接' ("sub-link") appears to be a placeholder for the site's index base URL
          mian_url_list = []
          if url_num > 1:
              start_num = 2
          else:
              start_num = 1

          for url_n in range(start_num, url_num + 1):
              if url_n != 1:
                  url = url_ys + 'index_%d.html'
                  new_url = format(url % url_n)
              else:
                  new_url = url_ys
              mian_url_list.append(new_url)
          return mian_url_list


      # Generate the list of picture-page URLs inside a gallery
      def create_sec_url(url, url_num, *args, **kwargs):
          """
          :param url: first page of the gallery
          :param url_num: number of pages to generate
          :return: list of picture-page URLs
          """
          sec_url_list = []
          for url_n in range(1, url_num + 1):
              if url_n != 1:
                  # new_url = url + '_'+str(url_n)+'.html'
                  begin = url.find("h")
                  end = url.rfind(".")
                  find_url = url[begin:end]
                  new_url = find_url + '_' + str(url_n) + '.html'
              else:
                  new_url = url
              sec_url_list.append(new_url)
          return sec_url_list


      # Write an entry to the download log
      def create_log(log_content):
          """
          Append one entry to the download log.
          :param log_content: text to write into the log
          :return: nothing
          """
          with open("log.txt", "a") as file:
              file.write(log_content)
          return


      # Record the last index page that was downloaded
      def page_record(page_num=0, *args, **kwargs):
          with open("page_record.txt", "w+") as file:
              file.write(str(page_num))  # cast to str so the default int value can also be written
          return


      # Read the recorded page number
      def page_read():
          with open("page_record.txt", "r") as file:
              r_page_num = file.readline()
          return r_page_num


      ############################################  Scraper main loop  ############################################

      n_yema = int(page_read())  # index page where the previous run stopped
      # print(n_yema)
      if polling_page_num > 361:
          print("The number you entered is outside the polling range, please enter it again!")
      elif polling_page_num > n_yema:
          end_page_num = polling_page_num
          print("The main program is about to run")

          # Generate the index-page URLs
          mian_url_list_ys = create_main_url(end_page_num)
          mian_url_list = mian_url_list_ys[int(n_yema) - 1:int(end_page_num) + 1]

          for url in mian_url_list:
              n_yema = n_yema + 1
              sec_url_li = anasy_main_page(download_page(url)[0])  # parse the index page, get the list of galleries
              print(len(sec_url_li), sec_url_li)
              log_mian_start = "*" * 15 + "Page " + str(n_yema) + ", starting download --> " + url + "*" * 15
              print(log_mian_start)  # notice that this index page has started downloading
              n_tao = 0
              for url_sec in sec_url_li[0:31]:
                  n_tao = n_tao + 1
                  dir_name = anasy_Secondary_page(download_page(url_sec, html_encode="utf-8")[0])[0]  # parse the gallery page, get the gallery name

                  print("*" * 15 + "Page " + str(n_yema) + ", gallery " + str(n_tao) + " -- " + dir_name + " -- starting download" + "*" * 15)
                  dir_name_sj = create_folder(dir_name)
                  sec_url_list = create_sec_url(url_sec, 60)
                  m = 0
                  for pic_link in sec_url_list:
                      m = m + 1
                      page_text, response_status_pic = download_page(pic_link)
                      if response_status_pic == 200:
                          donw_pic_link = anasy_Secondary_page(page_text)[1]  # parse the picture page, get the image link
                          down_pic(dir_name_sj, donw_pic_link)
                          print("Image " + str(m) + " downloaded successfully", donw_pic_link)
                          time.sleep(1)
                      else:
                          continue

                  print("Page " + str(n_yema) + ", gallery " + str(n_tao) + " -- " + dir_name + " -- all images downloaded" + "\n")
                  log_text = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "---" + "Page " + str(
                      n_yema) + ", gallery " + str(n_tao) + " -- " + dir_name + " -- finished downloading" + "\n"
                  create_log(log_content=log_text)
              log_main_end = "*" * 10 + "Page " + str(n_yema) + ", download complete --> " + url + "*" * 10 + "\n\n"
              print(log_main_end)
              # record the index-page number (n_yema) that has just been downloaded
              page_record(str(n_yema))
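
    One practical note: the script begins by calling page_read(), which opens page_record.txt for reading, so that file must exist before the first run or the script stops with a FileNotFoundError. A minimal way to seed it; the starting value of 1 is an assumption, not something the original states:

      # one-time setup before the first run: seed the page counter
      with open("page_record.txt", "w") as file:
          file.write("1")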