zoukankan      html  css  js  c++  java
  • python 爬虫

    #!/usr/bin/python3
    # -*- coding: UTF-8 -*-
     
    import urllib
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen
    import re
    import time
    import os
    import mysql.connector
     
    times = 0

    def saveDownedurl(downedurl):
        """Record a crawled page URL in the ``downedurl`` table.

        Opens a new MySQL connection to the ``picurl`` database, inserts
        ``downedurl`` into the ``picurl`` column, commits, and prints the
        affected row count followed by a success message.

        Args:
            downedurl: URL string to store.

        Raises:
            Any error raised by ``mysql.connector`` while connecting or
            inserting is propagated; the connection is still closed.
        """
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration or environment variables.
        conn = mysql.connector.connect(user='root', password='694521', database='picurl')
        try:
            cursor = conn.cursor()
            # Parameterized statement: the driver escapes the URL value.
            sql = "INSERT INTO downedurl (picurl) VALUES (%s)"
            cursor.execute(sql, [downedurl])
            conn.commit()
            print(cursor.rowcount, "记录插入成功。")
        finally:
            # Release the connection even when the insert or commit fails.
            conn.close()


    def download_pic(pic_url,root_url,down_times):
        """Download one image and save it as ``<down_times>.jpg``.

        The file is written relative to the current working directory.

        Args:
            pic_url: URL of the image to fetch.
            root_url: URL of the page the image was found on; sent as the
                ``Referer`` header.
            down_times: Counter used as the output file name (without the
                ``.jpg`` extension).

        Returns:
            ``down_times + 1``, the counter value for the next download.

        Raises:
            Errors from ``urlopen`` (e.g. ``urllib.error.URLError``) and
            from writing the file are propagated to the caller.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
            'Referer': root_url,
        }
        down_path = str(down_times)+'.jpg'
        print (down_path)
        request = Request(pic_url, headers=headers)
        # Close the response deterministically and bound the wait so a
        # stalled server cannot hang the crawl forever.
        with urlopen(request, timeout=30) as response:
            data = response.read()
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(down_path, 'wb') as f:
            f.write(data)
        return down_times + 1




    def jiexi_rootPic_url(next_rootUrl,down_times):
        """Download every ``<img src="...">`` image found on one page.

        Fetches ``next_rootUrl``, decodes it as UTF-8, extracts each
        ``<img src="...">`` value with a regular expression, and passes
        every image to ``download_pic`` with the page URL as referer.
        Sleeps two seconds after the page has been processed.

        Args:
            next_rootUrl: URL of the page to scan for images.
            down_times: Counter value to use for the first downloaded image.

        Returns:
            The counter value after all images on the page were downloaded.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urljoin

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        }
        request_url = Request(next_rootUrl, headers=headers)
        # Close the response deterministically; bound the wait with a timeout.
        with urlopen(request_url, timeout=30) as page:
            response = page.read().decode("utf-8")
        pattern = re.compile('<img src="(.*?)"', re.IGNORECASE)
        downtime = down_times
        for src in pattern.findall(response):
            print ('download_prepare')
            # Resolve relative and protocol-relative sources against the
            # page URL; absolute URLs are returned unchanged by urljoin.
            pic_url = urljoin(next_rootUrl, src)
            downtime = download_pic(pic_url, next_rootUrl, downtime)
            print(pic_url)
        # Pause between pages to avoid hammering the server.
        time.sleep(2)
        return downtime


    def jiexi_url(root_url,down_times):
        """Crawl a listing page and download the images of each linked page.

        Fetches ``root_url``, decodes it as UTF-8, finds every
        ``/rnyy<suffix>.html`` link, rebuilds it as an absolute URL on
        ``http://mmff30.com``, records it with ``saveDownedurl`` and hands
        it to ``jiexi_rootPic_url``.

        Args:
            root_url: URL of the listing page to crawl.
            down_times: Counter value to use for the first downloaded image.

        Returns:
            The counter value after all linked pages were processed.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        }
        request_url = Request(root_url, headers=headers)
        # Close the response deterministically; bound the wait with a timeout.
        with urlopen(request_url, timeout=30) as page:
            html = page.read().decode("utf-8")
        # Raw string with an escaped dot so only a literal ".html" matches.
        link_pattern = re.compile(r'/rnyy(.*?)\.html', re.IGNORECASE)
        downtime = down_times
        # dict.fromkeys drops repeated links while keeping page order, so a
        # page linked more than once is not crawled and recorded repeatedly.
        for suffix in dict.fromkeys(link_pattern.findall(html)):
            path = 'http://mmff30.com/rnyy'+suffix+'.html'
            print (path)
            saveDownedurl(path)
            downtime = jiexi_rootPic_url(path,downtime)
        # Return the running counter, matching the sibling functions.
        return downtime




    # Script entry: crawl the listing page, numbering saved images from 4000.
    jiexi_url(
        root_url='http://mmff30.com/rwmy_9_3.html',
        down_times=4000,
    )
  • 相关阅读:
    Python分析44130条用户观影数据,挖掘用户与电影之间的隐藏信息!
    办公利器!用Python快速将任意文件转为PDF
    教你用python搭建一个「生活常识解答」机器人
    办公利器!用Python批量识别发票并录入到Excel表格
    遇到禁止复制该怎么办?幸好我会Python...
    通知:生物信息学云论坛第十五场报告会
    centos7设置SSH安全策略–指定IP登陆
    SpringMVC—RequestMapping注解参数说明
    SpringMVC-方法四种类型返回值总结,你用过几种?
    Window下:自带python编辑器的wxpython项目发布打包exe
  • 原文地址:https://www.cnblogs.com/ytCui/p/13055992.html
Copyright © 2011-2022 走看看