  • Python crawler

    Python code to crawl article listings from 36大数据 (36dsj.com)

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import httplib
    # force HTTP/1.0 to avoid IncompleteRead errors from some servers
    httplib.HTTPConnection._http_vsn = 10
    httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    
    
    # Python 2: default str/unicode conversions to UTF-8
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    # fetch a page by URL and decode it as UTF-8
    def getHtml(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = unicode(response.read(),'utf-8')
        return html
    
    # save page content to a file, creating the directory if needed
    def saveHtml(filepath,html):
        file_dir = os.path.split(filepath)[0]
        if not os.path.isdir(file_dir):
           os.makedirs(file_dir)
        if os.path.exists(filepath):
           os.remove(filepath)
        file = open(filepath,'w')
        file.write(html)
        file.close()
    
    # read a saved file back as unicode
    def readHtml(filepath):
        f = open(filepath,'rb')
        return unicode(f.read(),'utf-8')
        
    
    # parse a saved page file with pyquery
    def resolveHtml(filepath):
        d = pq(filename=filepath)
        return d
        
    
    def resolveBlog(content):
        d_cont = pq(content)
        # article title
        title = d_cont('h2').text()
        # article href
        href = d_cont('h2').find('a').attr('href')
        # article ID (last path segment of the href)
        id = href.split('/')[-1]
        # author, date, category
        au_tm_cat = d_cont('p').filter('.info').text()
        author = au_tm_cat.split()[0]
        date = au_tm_cat.split()[1]
        cat = au_tm_cat.split()[2]
        # summary text
        note = d_cont('p').filter('.note').text()
        blog = [id,title,href,author,date,cat,note]
        return blog
    
    # scratch URLs left over from testing; the crawl loop at the bottom builds its own page URLs
    url = 'http://www.36dsj.com/'
    url = 'http://www.91333.com/pk10/'
    url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
    url = 'http://www.36dsj.com/'
    
    #html = getHtml(url)
    
    def crawlpage(url):
        page = urllib2.urlopen(url)
        text = unicode(page.read(),"utf-8")
        d = pq(text)
        bloglist = []
        for i in range(d('article').filter('.excerpt').length):
            content = d('article').filter('.excerpt').eq(i).html()
            a = resolveBlog(content)
            #print a[5]
            bloglist.append(a)
        return bloglist
    
    def crawler(url):
        article = crawlpage(url)
        for i in article:
            print i[0],i[1],i[2]
            html = getHtml(i[2])
            htmlname = i[2].split('/')[-1]
            d = pq(html)
            s = d('article').html()
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.html',s)
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.title',i[1]) 
            imgurl =  d('img').attr('src')
            #imgname = i[2].split('/')[-1]
            #print imgname
            ir = imgurl.encode("utf-8")
            #print ir
            urllib.urlretrieve(ir,'/etl/etldata/script/tmpdir/image/'+htmlname+'.jpg')
            #saveHtml('/etl/etldata/input/html/'+i[0]+'.html',html)
    
    url_base='http://www.36dsj.com/page/'
    for i in range(1,5):
        url = url_base+str(i)
        print url
        try :
           crawler(url)
        except:
           continue
    #url = 'http://www.36dsj.com/page/1'
    #crawler(url)
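
    The paging loop above swallows every exception with a bare except/continue, so failed pages disappear silently. A minimal variant (a sketch only, reusing the crawler() and url_base defined above) that at least reports the failing URL:

    # sketch: same paging loop, but report which page failed instead of
    # silently continuing; crawler() and url_base are defined above
    for i in range(1, 5):
        url = url_base + str(i)
        try:
            crawler(url)
        except Exception as e:
            print 'failed to crawl %s: %s' % (url, e)
            continue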

    Shell script that calls the Python crawler for 36大数据 articles

    #!/bin/bash
    alias dt='date +%Y-%m-%d" "%H:%M:%S'
    shopt -s expand_aliases
    
    
    pypath=/usr/local/python2.7/bin
    
    dir=/etl/etldata/script
    
    a=`echo $RANDOM`
    
    a=`ls  ${dir}/tmpdir/image/ | wc -l`
    
    echo "ALL:${a}"
    
    num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
    
    if [ ${num} -eq 0 ];then
       num=1
    fi
    
    #num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
    echo ${num}
    
    #num=1
    
    
    #dir=/etl/etldata/script
    
    
    $pypath/python ${dir}/tmpdir/crawler_36.py
    
    ls -ltr ${dir}/tmpdir/image/ | awk '{print $NF}' | sed 's/\.jpg//g' > ${dir}/tmp
    
    #id=`cat ${dir}/tmp | sed -n "${num}p" | awk '{print $1}'`
    
    id=`cat ${dir}/tmp | head -${num} | awk '{print $1}' | tail -1`
      
    echo  "`dt`:${id}"
      
    $pypath/python ${dir}/tmpdir/mail.py ${id}
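
    For reference, the random-article selection that the script builds out of awk, head and tail can be written directly in Python. A small sketch, assuming the same image directory and that every crawled article left an <id>.jpg behind:

    # sketch: pick a random downloaded article ID, mirroring the awk/head/tail
    # pipeline in the shell script above
    import os
    import random

    img_dir = '/etl/etldata/script/tmpdir/image/'
    ids = [name[:-4] for name in os.listdir(img_dir) if name.endswith('.jpg')]
    if ids:
        print random.choice(ids)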

    Python code to crawl and parse web pages

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import requests
    import json
    import chardet
    
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    
    
    def crawlpage(url):
        page = urllib2.urlopen(url)
        text = unicode(page.read(),"utf-8")
        d = pq(text)
        bloglist = []
        for i in range(d('article').filter('.excerpt').length):
            content = d('article').filter('.excerpt').eq(i).html()
            a = resolveBlog(content)
            #print a[5]
            bloglist.append(a)
        return bloglist
    
    # fetch a page by URL and decode it as UTF-8
    def getHtml(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = unicode(response.read(),'utf-8')
        return html
        
    def get(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = response.read()
        s1 = sys.getfilesystemencoding()
        s2 = chardet.detect(html)['encoding'] 
        print s1,s2
        #print html
        return html.decode(s2).encode(s1)
    
    # save page content to a file, creating the directory if needed
    def saveHtml(filepath,html):
        file_dir = os.path.split(filepath)[0]
        if not os.path.isdir(file_dir):
           os.makedirs(file_dir)
        if os.path.exists(filepath):
           os.remove(filepath)
        file = open(filepath,'a')
        file.write(html)
        file.close()
    
    # read a saved page file back as unicode
    def readHtml(filepath):
        f = open(filepath,'rb')
        return unicode(f.read(),'utf-8')
        
    
    # parse a saved page file with pyquery
    def resolveHtml(filepath):
        d = pq(filename=filepath)
        return d
        
    
    def resolveBlog(content):
        d_cont = pq(content)
        # article title
        title = d_cont('h2').text()
        # article href
        href = d_cont('h2').find('a').attr('href')
        # article ID (last path segment of the href)
        id = href.split('/')[-1]
        # author, date, category
        au_tm_cat = d_cont('p').filter('.info').text()
        author = au_tm_cat.split()[0]
        date = au_tm_cat.split()[1]
        cat = au_tm_cat.split()[2]
        # summary text
        note = d_cont('p').filter('.note').text()
        blog = [id,title,href,author,date,cat,note]
        return blog
        
        
    def GetHtml(url):  
        page = urllib.urlopen(url)  
        contex = page.read()  
        return contex  
    
    
    
    def GetData(cont):
        data = pq(cont)
        # draw date, draw code, and winning numbers columns
        date_id = data('td').eq(0).text()
        code_id = data('td').eq(1).text()
        A = data('td').eq(2).text()
        s = date_id+' '+code_id+' '+A+'\n'
        return s
    # scratch URLs left over from testing; craw_pk10 below builds its own URL
    url = 'http://www.36dsj.com/'
    url = 'http://www.91333.com/pk10/'
    url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
    url = 'http://www.36dsj.com/'
    url = 'http://www.weather.com.cn/weather1d/101240303.shtml#input'
    url = 'http://www.nmc.gov.cn/publish/forecast/AJX/wuyuan.html'
    url = 'http://php.weather.sina.com.cn/search.php?city=%E6%C4%D4%B4&dpc=1'
    #url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'
    #url = 'http://hq.sinajs.cn/list=sh601006'
    
    def craw_pk10(date_id):
        url = 'http://baidu.lecai.com/lottery/draw/list/557?d='+date_id
        html = getHtml(url)
        d = pq(html)
        data=d('table').filter('#draw_list').find('tbody').find('tr')
        print data.length
        for i in range(data.length):
            s = GetData(data.eq(i))
            a = open('/home/hadoop/python/PK10_20170610.csv','a')
            a.write(s)
            a.close()
        #saveHtml('/home/hadoop/python/2017-06-10.csv',s)
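
    A usage sketch for craw_pk10: iterate over a short range of draw dates and append each day's rows to the CSV. The YYYY-MM-DD date format is an assumption taken from the commented-out filename above; adjust it if the lecai d= parameter expects something else.

    # sketch: call craw_pk10 for a short, example range of dates
    import datetime

    start = datetime.date(2017, 6, 1)
    for n in range(10):
        day = start + datetime.timedelta(days=n)
        craw_pk10(day.strftime('%Y-%m-%d'))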
  • Original article: https://www.cnblogs.com/Jims2016/p/7134753.html