python 爬虫 - 走看看

zoukankan html css js c++ java

python 爬虫

#!/usr/bin/python3

# -*- coding: UTF-8 -*-

import urllib

from urllib.parse import urlencode

from urllib.request import Request, urlopen

import re

import time

import os

import mysql.connector

# NOTE(review): this global is not read or written anywhere else in the
# visible code — it looks unused; confirm before removing.
times = 0

def saveDownedurl(downedurl):
    """Record a crawled page URL in the ``downedurl`` table.

    Opens a fresh MySQL connection to the ``picurl`` database, inserts
    ``downedurl`` into the ``picurl`` column through a parameterized query,
    commits, and prints the affected row count.

    Args:
        downedurl: The URL string to store.

    Raises:
        Any exception raised by ``mysql.connector`` while connecting,
        inserting or committing is propagated to the caller; the connection
        is still closed in that case.
    """
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment.
    conn = mysql.connector.connect(user='root', password='694521', database='picurl')
    try:
        cursor = conn.cursor()
        try:
            # Parameterized query: the driver escapes the URL value.
            cursor.execute("INSERT INTO downedurl (picurl) VALUES (%s)", [downedurl])
            conn.commit()
            print(cursor.rowcount, "记录插入成功。")
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()

def download_pic(pic_url, root_url, down_times):
    """Download one image and save it as ``<down_times>.jpg``.

    The file is written relative to the current working directory.

    Args:
        pic_url: Image URL as found in the page; may be absolute or relative.
        root_url: URL of the page that referenced the image. It is sent as
            the ``Referer`` header and used as the base for resolving a
            relative ``pic_url``.
        down_times: Integer counter used as the file name stem.

    Returns:
        ``down_times + 1``, i.e. the counter to use for the next image.

    Raises:
        Exceptions from ``urlopen`` (e.g. ``urllib.error.URLError``) and
        ``OSError`` from writing the file are propagated.
    """
    from urllib.parse import urljoin

    # Request() raises ValueError for scheme-less URLs such as "/a.jpg", so
    # resolve against the referring page first. Absolute URLs pass through
    # urljoin unchanged.
    url = urljoin(root_url, pic_url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        'Referer': root_url,
    }

    down_path = str(down_times) + '.jpg'
    print(down_path)

    request = Request(url, headers=headers)
    # Close the response deterministically instead of leaking the socket.
    with urlopen(request) as response:
        data = response.read()

    with open(down_path, 'wb') as f:
        f.write(data)

    return down_times + 1

def jiexi_rootPic_url(next_rootUrl, down_times):
    """Fetch one page and download every image it references.

    The page is fetched and decoded as UTF-8. Every ``<img src="...">``
    value found (case-insensitive) is passed to ``download_pic`` with this
    page's URL as the referer.

    Args:
        next_rootUrl: URL of the page to scan for images.
        down_times: Starting value of the running image counter.

    Returns:
        The counter value after all images on the page were downloaded.

    Raises:
        Exceptions from ``urlopen``, ``UnicodeDecodeError`` from decoding
        the page, and anything raised by ``download_pic`` are propagated.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
    }

    request_url = Request(next_rootUrl, headers=headers)
    # Close the response deterministically instead of leaking the socket.
    with urlopen(request_url) as response:
        html = response.read().decode("utf-8")

    # Only matches tags where src is the first attribute after "<img".
    pattern = re.compile(r'<img src="(.*?)"', re.IGNORECASE)

    downtime = down_times
    for pic in pattern.findall(html):
        print('download_prepare')
        downtime = download_pic(pic, next_rootUrl, downtime)
        print(pic)

    # Pause once per page (not per image) — presumably to throttle requests.
    time.sleep(2)

    return downtime

def jiexi_url(root_url, down_times):
    """Crawl a listing page and process every ``/rnyy*.html`` page it links.

    The listing page is fetched and decoded as UTF-8. For each
    ``/rnyy<suffix>.html`` link found, the absolute URL on ``mmff30.com`` is
    printed, recorded with ``saveDownedurl`` and then scanned for images
    with ``jiexi_rootPic_url``.

    Args:
        root_url: URL of the listing page.
        down_times: Starting value of the running image counter.

    Returns:
        The counter value after all linked pages were processed. Returned
        for consistency with ``jiexi_rootPic_url`` and ``download_pic``;
        callers that ignore the result are unaffected.

    Raises:
        Exceptions from ``urlopen``, ``UnicodeDecodeError`` from decoding
        the page, and anything raised by the helpers are propagated.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
    }

    request_url = Request(root_url, headers=headers)
    # Close the response deterministically instead of leaking the socket.
    with urlopen(request_url) as response:
        html = response.read().decode("utf-8")

    # The dot before "html" is escaped so it only matches a literal ".";
    # unescaped it matched any character (e.g. "/rnyy_1xhtml").
    link_pattern = re.compile(r'/rnyy(.*?)\.html', re.IGNORECASE)

    downtime = down_times
    for suffix in link_pattern.findall(html):
        # NOTE(review): the host is hard-coded rather than derived from
        # root_url — confirm this is intended.
        path = 'http://mmff30.com/rnyy' + suffix + '.html'
        print(path)
        saveDownedurl(path)
        downtime = jiexi_rootPic_url(path, downtime)

    return downtime

# Script entry point: crawl this listing page, passing 4000 as the starting
# value of the image counter. NOTE: this runs at module load time (there is
# no ``if __name__ == "__main__"`` guard), so importing the file starts the crawl.
jiexi_url('http://mmff30.com/rwmy_9_3.html',4000)

查看全文

相关阅读:
console报错：No mapping found for HTTP request with URI(xxx)
jquery,ajax详解
 spring-mvc.xml 和 application-context.xml的区别
 robot
Linux记录history命令
 防火墙知识小结
 wireshark语法小结
 https和证书小结
 生成自签名证书：生成证书和秘钥
 查看win信任的证书办法机构（CA机构的公钥）

原文地址：https://www.cnblogs.com/ytCui/p/13055992.html