zoukankan      html  css  js  c++  java
  • 第一个爬虫代码

    # !/usr/bin/python
    #coding=GBK
    import urllib.request
    import re


    #file=open("F:/python_workspace/爬虫/图片/0.jpg","wb")
    #url="http://desk.zol.com.cn/2560x1600/"
    def gethtml(url, timeout=10):
        """Fetch *url* and return the raw response body as bytes.

        The request is sent with a desktop Firefox ``User-Agent`` header.

        Args:
            url: Address of the resource to download.
            timeout: Seconds to wait on the connection before giving up.

        Returns:
            The undecoded response body (``bytes``); callers decode it.

        Raises:
            urllib.error.URLError: If the resource cannot be opened.
        """
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0"}
        req = urllib.request.Request(url, headers=header)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as res:
            return res.read()

    def getcata(html, reg):
        """Return every match of the regular expression *reg* in *html*.

        Args:
            html: Decoded page text to search.
            reg: Regular expression pattern, e.g.
                ``r'href="(/[a-z]+/.*?2560x1600/)'``.

        Returns:
            The list produced by ``re.findall``: the captured group for a
            pattern with one group, or an empty list when nothing matches.
        """
        return re.findall(reg, html)

    def geturl(url):
        """Return the image URLs embedded in the page at *url*.

        The page is downloaded with ``gethtml``, decoded as GBK, and searched
        for ``imgsrc":"..."`` entries whose value starts with ``http:`` and
        ends in ``.png`` or ``.jpg``.

        Args:
            url: Address of the page to scan.

        Returns:
            A list of the matched URL strings, exactly as they appear in the
            page text.
        """
        html = gethtml(url).decode("GBK")
        # One lazy match that stops at the first ".png" or ".jpg".  The former
        # alternation tried ".png" first and could run across several ".jpg"
        # entries on the same line; its dots were also unescaped.
        return getcata(html, r'imgsrc":"(http:.*?\.(?:png|jpg))')

    def getpicurl(picurl, num, save_dir='F:/python_workspace/爬虫/图片/'):
        """Download one picture and save it as ``<num>.png`` in *save_dir*.

        Before downloading, every backslash is removed from *picurl* and the
        ``##SIZE##`` placeholder is replaced with ``2560x1600``.

        Args:
            picurl: Image URL as scraped from the page; may contain
                backslashes (presumably JSON-escaped slashes) and the
                ``##SIZE##`` placeholder.
            num: Value used as the file name stem.
            save_dir: Directory the file is written to.

        Raises:
            urllib.error.URLError: If the download fails.
            OSError: If the target file cannot be written.
        """
        import os.path

        # Plain string replacement; the former pattern r'(\)' was not a valid
        # regular expression and raised re.error on every call.
        url = picurl.replace("\\", "").replace("##SIZE##", "2560x1600")

        # Download first so a failed request does not leave an empty file.
        with urllib.request.urlopen(url, timeout=10) as res:
            data = res.read()

        # NOTE: the ".png" suffix is used even when the source URL is a .jpg.
        target = os.path.join(save_dir, str(num) + '.png')
        with open(target, "wb") as file:
            file.write(data)





    #html=gethtml("http://desk.zol.com.cn/2560x1600/").decode('GBK')
    #cata_list=getcata(html,r'href="(/[a-z]+/.*?2560x1600/)')
    #for i in cata_list:
    # geturl(i)
    #ss="http://desk.fd.zol-img.com.cn/t_s##SIZE##/g5/M00/0D/03/ChMkJlmVBaOIK26rAAJ3foZd400AAfwAADpPesAAneW914.jpg"
    #getpicurl(ss)
    # Crawl listing pages 1..46 of the 2560x1600 wallpaper section, follow each
    # wallpaper detail link, and download every picture found on it.
    domain = "http://desk.zol.com.cn"
    count = 0  # number of pictures saved so far; also the next file name
    for urlcount in range(1, 47):
        url = 'http://desk.zol.com.cn/2560x1600/' + str(urlcount) + '.html'
        try:
            html = gethtml(url).decode('GBK')
            cata_list = getcata(html, r'href="(/bizhi/.*?\.html)" target="_blank" hidefocus="true"><img width="208px"')
        except Exception as exc:
            # Best effort: skip a listing page that cannot be fetched or decoded.
            print("gethtml method error!", exc)
            continue

        for i in cata_list:
            # Listing links are site-relative; make them absolute.
            i = domain + i
            try:
                picurllist = geturl(i)
            except Exception as exc:
                print("picurllist method error!", exc)
                continue

            for j in picurllist:
                try:
                    getpicurl(j, count)
                except Exception as exc:
                    print("getpicurl method error!", exc)
                    continue
                # Advance the counter only after a successful save, so file
                # names stay consecutive.
                count = count + 1
                print(j)
  • 相关阅读:
    Struts2(五)——核心拦截器
    Struts2(四)——页面相关内容
    Struts2(三)——数据在框架中的数据流转问题
    Python Day 1
    c++-STL:删除子串
    九度1165:字符串匹配
    九度1051:数字阶梯求和
    数据结构之二叉树基础三
    数据结构之二叉树基础二
    数据结构之二叉树基础一
  • 原文地址:https://www.cnblogs.com/perTest/p/7635298.html
Copyright © 2011-2022 走看看