zoukankan      html  css  js  c++  java
  • python 爬虫

    #!/usr/bin/python3
    # -*- coding: UTF-8 -*-
     
    import os
    import re
    import time
    import urllib
    from urllib.parse import urlencode, urljoin
    from urllib.request import Request, urlopen

    import mysql.connector
     
    times = 0

    def saveDownedurl(downedurl):
        """Record a crawled page URL in the picurl.downedurl MySQL table.

        downedurl: the page URL string to persist.
        Raises mysql.connector errors on connection/insert failure; the
        connection is always closed, even on error.
        """
        conn = mysql.connector.connect(user='root', password='694521', database='picurl')
        try:
            cursor = conn.cursor()
            # Parameterized query — never interpolate the URL into the SQL text.
            cursor.execute("INSERT INTO downedurl (picurl) VALUES (%s)", [downedurl])
            conn.commit()
            print(cursor.rowcount, "记录插入成功。")
        finally:
            # Fix: originally the connection leaked if execute()/commit() raised.
            conn.close()
    


    def download_pic(pic_url, root_url, down_times):
        """Download one image and save it to '<down_times>.jpg' in the cwd.

        pic_url:    direct URL of the image to fetch.
        root_url:   page the image was found on, sent as the Referer header
                    (some image hosts reject requests without one).
        down_times: running counter used to name the output file.
        Returns the incremented counter for the caller to thread through.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
            'Referer': root_url,
        }
        down_path = str(down_times) + '.jpg'
        print(down_path)
        request = Request(pic_url, headers=headers)
        # Fix: close the HTTP response instead of leaking the socket;
        # the original also called f.close() redundantly inside `with`.
        with urlopen(request) as resp:
            data = resp.read()
        with open(down_path, 'wb') as f:
            f.write(data)
        return down_times + 1
    




    def jiexi_rootPic_url(next_rootUrl,down_times):
         url = next_rootUrl
         headers = {
         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
         }
         downtime = down_times
         request_url = Request(url, headers=headers)
         response = urlopen(request_url).read().decode("utf-8") 
         pattern = re.compile('<img src="(.*?)"', re.IGNORECASE)
         pic_path =  pattern.findall(response)
         for i in pic_path:
              print ('download_prepare')
              downtime = download_pic(i,url,downtime) 
              print(i)
         time.sleep(2)
         return downtime


    def jiexi_url(root_url,down_times):
         headers = {
         'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
         }
         downtime = down_times
         url = root_url
         request_url = Request(url, headers=headers)
         html = urlopen(request_url).read().decode("utf-8") 
         response = re.compile('/rnyy(.*?).html', re.IGNORECASE)
         all_next_root =  response.findall(html)
         for i in all_next_root:
              path = 'http://mmff30.com/rnyy'+i+'.html'
              print (path)
              saveDownedurl(path)
              downtime = jiexi_rootPic_url(path,downtime)




    jiexi_url('http://mmff30.com/rwmy_9_3.html',4000)
  • 相关阅读:
    (BFS 二叉树) leetcode 515. Find Largest Value in Each Tree Row
    (二叉树 BFS) leetcode513. Find Bottom Left Tree Value
    (二叉树 BFS DFS) leetcode 104. Maximum Depth of Binary Tree
    (二叉树 BFS DFS) leetcode 111. Minimum Depth of Binary Tree
    (BFS) leetcode 690. Employee Importance
    (BFS/DFS) leetcode 200. Number of Islands
    (最长回文子串 线性DP) 51nod 1088 最长回文子串
    (链表 importance) leetcode 2. Add Two Numbers
    (链表 set) leetcode 817. Linked List Components
    (链表 双指针) leetcode 142. Linked List Cycle II
  • 原文地址:https://www.cnblogs.com/ytCui/p/13055992.html
Copyright © 2011-2022 走看看