  • Python crawler

    Python code for crawling articles from 36dsj.com (36大数据)

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import httplib
    # force HTTP/1.0 to work around incomplete-read errors from some servers
    httplib.HTTPConnection._http_vsn = 10
    httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    
    
    # Python 2 hack: make utf-8 the default codec so unicode strings can be written directly
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    # fetch a page by URL and decode it as UTF-8
    def getHtml(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = unicode(response.read(),'utf-8')
        return html
    
    # save page content to a file, creating the directory if needed
    def saveHtml(filepath,html):
        file_dir = os.path.split(filepath)[0]
        if not os.path.isdir(file_dir):
           os.makedirs(file_dir)
        if os.path.exists(filepath):
           os.remove(filepath)
        file = open(filepath,'w')
        file.write(html)
        file.close()
    
    # read a saved file back as unicode
    def readHtml(filepath):
         f  = open(filepath,'rb')
         return unicode(f.read(),'utf-8')
        
    
    # parse a saved page into a pyquery document
    def resolveHtml(filepath):
        d = pq(filename=filepath)
        return d
        
    
    def resolveBlog(content):
        d_cont = pq(content)
        # article title
        title = d_cont('h2').text()
        # article link
        href = d_cont('h2').find('a').attr('href')
        # article ID (last path segment of the href)
        id = href.split('/')[-1]
        # author, date and category are space-separated in the .info paragraph
        au_tm_cat = d_cont('p').filter('.info').text()
        author = au_tm_cat.split()[0]
        date = au_tm_cat.split()[1]
        cat = au_tm_cat.split()[2]
        # article summary
        note = d_cont('p').filter('.note').text()
        blog = [id,title,href,author,date,cat,note]
        return blog
    
    # URLs tried during development; only the last assignment takes effect
    url = 'http://www.36dsj.com/'
    url = 'http://www.91333.com/pk10/'
    url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
    url = 'http://www.36dsj.com/'
    
    #html = getHtml(url)
    
    # crawl one listing page and return a list of [id, title, href, author, date, cat, note] entries
    def crawlpage(url):
        page = urllib2.urlopen(url)
        text = unicode(page.read(),"utf-8")
        d = pq(text)
        bloglist = []
        for i in range(d('article').filter('.excerpt').length):
            content = d('article').filter('.excerpt').eq(i).html()
            a = resolveBlog(content)
            #print a[5]
            bloglist.append(a)
        return bloglist
    
    # for every article on a listing page: save its HTML body, its title and its first image
    def crawler(url):
        article = crawlpage(url)
        for i in article:
            print i[0],i[1],i[2]
            html = getHtml(i[2])
            htmlname = i[2].split('/')[-1]
            d = pq(html)
            s = d('article').html()
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.html',s)
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.title',i[1]) 
            imgurl =  d('img').attr('src')
            #imgname = i[2].split('/')[-1]
            #print imgname
            ir = imgurl.encode("utf-8")
            #print ir
            urllib.urlretrieve(ir,'/etl/etldata/script/tmpdir/image/'+htmlname+'.jpg')
            #saveHtml('/etl/etldata/input/html/'+i[0]+'.html',html)
    
    url_base='http://www.36dsj.com/page/'
    # crawl listing pages 1 to 4; skip any page that fails
    for i in range(1,5):
        url = url_base+str(i)
        print url
        try:
           crawler(url)
        except:
           continue
    #url = 'http://www.36dsj.com/page/1'
    #crawler(url)
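
    For reference, here is a minimal Python 3 sketch of the same crawl loop using requests and pyquery instead of urllib2. The article.excerpt selectors and the listing-page URLs come from the script above; the output directory ./html and the timeout values are assumptions made for this example.

    # -*- coding: utf-8 -*-
    # Minimal Python 3 sketch of the crawl loop above (requests + pyquery).
    # Paths and timeouts are illustrative, not the original production values.
    import os
    import requests
    from pyquery import PyQuery as pq

    def crawl_listing(url):
        # yield (article_id, title, href) for each excerpt on one listing page
        d = pq(requests.get(url, timeout=10).text)
        for item in d('article.excerpt').items():
            href = item('h2 a').attr('href')
            yield href.split('/')[-1], item('h2').text(), href

    def save_article(href, title, outdir='./html'):
        # save an article's body and title next to each other, keyed by its id
        os.makedirs(outdir, exist_ok=True)
        body = pq(requests.get(href, timeout=10).text)('article').html() or ''
        name = href.split('/')[-1]
        with open(os.path.join(outdir, name + '.html'), 'w', encoding='utf-8') as f:
            f.write(body)
        with open(os.path.join(outdir, name + '.title'), 'w', encoding='utf-8') as f:
            f.write(title)

    if __name__ == '__main__':
        for page in range(1, 5):
            for art_id, title, href in crawl_listing('http://www.36dsj.com/page/%d' % page):
                print(art_id, title, href)
                save_article(href, title)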

    Shell script that calls the Python crawler for 36dsj.com articles

    #!/bin/bash
    alias dt='date +%Y-%m-%d" "%H:%M:%S'
    shopt -s expand_aliases
    
    
    pypath=/usr/local/python2.7/bin
    
    dir=/etl/etldata/script
    
    a=`echo $RANDOM`
    
    # count the images already downloaded
    a=`ls  ${dir}/tmpdir/image/ | wc -l`
    
    echo "ALL:${a}"
    
    num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
    
    # rand()*20 can truncate to 0; make sure we pick at least the first article
    if [ ${num} -eq 0 ];then
       num=1
    fi
    
    #num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
    echo ${num}
    
    #num=1
    
    
    #dir=/etl/etldata/script
    
    
    # run the crawler to refresh the downloaded articles
    $pypath/python ${dir}/tmpdir/crawler_36.py
    
    # list downloaded images and strip the .jpg suffix to get article ids
    ls -ltr ${dir}/tmpdir/image/ | awk '{print $NF}' | sed 's/\.jpg//g' > ${dir}/tmp
    
    #id=`cat ${dir}/tmp | sed -n "${num}p" | awk '{print $1}'`
    
    # pick the num-th article id from the list
    id=`cat ${dir}/tmp | head -${num} | awk '{print $1}' | tail -1`
      
    echo  "`dt`:${id}"
      
    # mail the selected article
    $pypath/python ${dir}/tmpdir/mail.py ${id}
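
    The shell script above re-runs the crawler, picks a pseudo-random index between 1 and 20, selects the corresponding article id from the downloaded images and hands it to mail.py. A sketch of that selection step in Python (the image directory and mail.py path are taken from the shell script; everything else is illustrative):

    # -*- coding: utf-8 -*-
    # Sketch of the shell script's selection step: pick a random downloaded
    # article id and pass it to mail.py. Illustrative, not the production job.
    import os
    import random
    import subprocess

    IMAGE_DIR = '/etl/etldata/script/tmpdir/image'
    MAILER = '/etl/etldata/script/tmpdir/mail.py'

    ids = [f[:-len('.jpg')] for f in os.listdir(IMAGE_DIR) if f.endswith('.jpg')]
    print('ALL:%d' % len(ids))

    if ids:
        # mirror the 1..20 window used by the awk one-liner above
        article_id = random.choice(ids[:20])
        print(article_id)
        subprocess.call(['python', MAILER, article_id])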

    Python code for crawling web pages

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import requests
    import json
    import chardet
    
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    
    
    # crawl one 36dsj listing page and return a list of blog metadata entries
    def crawlpage(url):
        page = urllib2.urlopen(url)
        text = unicode(page.read(),"utf-8")
        d = pq(text)
        bloglist = []
        for i in range(d('article').filter('.excerpt').length):
            content = d('article').filter('.excerpt').eq(i).html()
            a = resolveBlog(content)
            #print a[5]
            bloglist.append(a)
        return bloglist
    
    # fetch a page by URL and decode it as UTF-8
    def getHtml(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = unicode(response.read(),'utf-8')
        return html
        
    # fetch a page, detect its encoding with chardet and re-encode it to the local filesystem encoding
    def get(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = response.read()
        s1 = sys.getfilesystemencoding()
        s2 = chardet.detect(html)['encoding'] 
        print s1,s2
        #print html
        return html.decode(s2).encode(s1)
    
    # save page content to a file, creating the directory if needed
    def saveHtml(filepath,html):
        file_dir = os.path.split(filepath)[0]
        if not os.path.isdir(file_dir):
           os.makedirs(file_dir)
        if os.path.exists(filepath):
           os.remove(filepath)
        file = open(filepath,'a')
        file.write(html)
        file.close()
    
    # read a saved page file back as unicode
    def readHtml(filepath):
         f  = open(filepath,'rb')
         return unicode(f.read(),'utf-8')
        
    
    # parse a saved page into a pyquery document
    def resolveHtml(filepath):
        d = pq(filename=filepath)
        return d
        
    
    def resolveBlog(content):
        d_cont = pq(content)
        # article title
        title = d_cont('h2').text()
        # article link
        href = d_cont('h2').find('a').attr('href')
        # article ID (last path segment of the href)
        id = href.split('/')[-1]
        # author, date and category are space-separated in the .info paragraph
        au_tm_cat = d_cont('p').filter('.info').text()
        author = au_tm_cat.split()[0]
        date = au_tm_cat.split()[1]
        cat = au_tm_cat.split()[2]
        # article summary
        note = d_cont('p').filter('.note').text()
        blog = [id,title,href,author,date,cat,note]
        return blog
        
        
    # fetch a page with urllib without decoding it
    def GetHtml(url):
        page = urllib.urlopen(url)  
        contex = page.read()  
        return contex  
    
    
    
    # build one space-separated output row from the first three cells of a draw-table <tr>
    def GetData(cont):
        data = pq(cont)
        # draw date
        date_id = data('td').eq(0).text()
        code_id = data('td').eq(1).text()
        A = data('td').eq(2).text()
        s = date_id+' '+code_id+' '+A+'\n'
        return s
    # URLs tried during development; only the last uncommented assignment takes effect
    url = 'http://www.36dsj.com/'
    url = 'http://www.91333.com/pk10/'
    url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
    url = 'http://www.36dsj.com/'
    url = 'http://www.weather.com.cn/weather1d/101240303.shtml#input'
    url = 'http://www.nmc.gov.cn/publish/forecast/AJX/wuyuan.html'
    url = 'http://php.weather.sina.com.cn/search.php?city=%E6%C4%D4%B4&dpc=1'
    #url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'
    #url = 'http://hq.sinajs.cn/list=sh601006'
    
    # crawl the PK10 draw list for one date and append each row to a CSV
    def craw_pk10(date_id):
        url = 'http://baidu.lecai.com/lottery/draw/list/557?d='+date_id
        html =getHtml(url)
        d = pq(html)
        data=d('table').filter('#draw_list').find('tbody').find('tr')
        print data.length
        for i in range(data.length):
            s = GetData(data.eq(i))
            a = open('/home/hadoop/python/PK10_20170610.csv','a')
            a.write(s)
            a.close()
        #saveHtml('/home/hadoop/python/2017-06-10.csv',s)
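
    A short usage sketch for craw_pk10: loop over dates and append each day's draws to the CSV. The YYYY-MM-DD format of date_id is an assumption inferred from the ?d= query parameter above; the start date is illustrative.

    # Usage sketch for craw_pk10 (same Python 2 environment as the script above).
    # The YYYY-MM-DD format of date_id is an assumption based on the ?d= parameter.
    import datetime

    start = datetime.date(2017, 6, 1)
    for offset in range(10):
        date_id = (start + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
        print(date_id)
        craw_pk10(date_id)   # appends that day's draws to PK10_20170610.csv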
  • Original source: https://www.cnblogs.com/Jims2016/p/7134753.html