zoukankan      html  css  js  c++  java
  • Python爬虫

    python爬取36大数据文件代码

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import httplib
    # Override httplib's protocol-version attributes on the HTTPConnection
    # class with 10 / 'HTTP/1.0' in place of the library's own values.
    # NOTE(review): presumably a workaround for read errors seen with
    # HTTP/1.1 responses from the crawled site — confirm before removing.
    httplib.HTTPConnection._http_vsn = 10
    httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    
    
    # Python 2 only: reload the sys module, then set the interpreter-wide
    # default string encoding to UTF-8.
    # NOTE(review): presumably the reload is needed because
    # setdefaultencoding is not reachable on the already-imported sys
    # module — confirm.
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    #获取网页
    def getHtml(url):
        """Download *url* and return its body as a unicode string.

        The raw response bytes are decoded as UTF-8.
        """
        raw_bytes = urllib2.urlopen(urllib2.Request(url)).read()
        return unicode(raw_bytes, 'utf-8')
    
    #保存网页
    def saveHtml(filepath,html):
        """Write *html* to *filepath*, replacing any existing file.

        Missing parent directories are created first.  A bare file name
        (no directory component) is written to the current working
        directory.
        """
        file_dir = os.path.split(filepath)[0]
        # os.path.split() yields '' for a bare file name and os.makedirs('')
        # raises, so only create a directory when one is actually named.
        if file_dir and not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        if os.path.exists(filepath):
            os.remove(filepath)
        # The context manager closes the handle even if write() raises.
        with open(filepath, 'w') as out:
            out.write(html)
    
    #读文件
    def readHtml(filepath):
        """Return the contents of the file at *filepath* decoded from UTF-8."""
        # Read inside a context manager so the handle is always closed.
        with open(filepath, 'rb') as f:
            raw = f.read()
        return raw.decode('utf-8')
        
    
    #解析网页
    def resolveHtml(filepath):
        """Parse the HTML file at *filepath* and return the PyQuery document."""
        return pq(filename=filepath)
        
    
    def resolveBlog(content):
        """Extract the metadata of one article excerpt.

        *content* is the HTML of a single excerpt.  Returns the list
        [id, title, href, author, date, category, note].
        """
        doc = pq(content)
        heading = doc('h2')
        # Article title
        title = heading.text()
        # Article link
        href = heading.find('a').attr('href')
        # Article ID: the last path segment of the link
        article_id = href.split('/')[-1]
        # The ".info" paragraph holds author, date and category, in that
        # order, separated by whitespace.
        info_fields = doc('p').filter('.info').text().split()
        author, date, cat = info_fields[0], info_fields[1], info_fields[2]
        # Summary text
        note = doc('p').filter('.note').text()
        return [article_id, title, href, author, date, cat, note]
    
    # Start URL.  Alternative targets that used to be assigned here and
    # were immediately overwritten (dead stores):
    #   http://www.91333.com/pk10/
    #   http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563
    url = 'http://www.36dsj.com/'
    
    #html = getHtml(url)
    
    def crawlpage(url):
        """Fetch a listing page and parse every article excerpt on it.

        Returns a list with one resolveBlog() record for each ``article``
        element carrying the ``excerpt`` class on the page at *url*.
        The page body is decoded as UTF-8.
        """
        page = urllib2.urlopen(url)
        try:
            text = unicode(page.read(),"utf-8")
        finally:
            # Release the connection even if reading or decoding fails.
            page.close()
        # Run the selector once instead of re-querying on every iteration.
        excerpts = pq(text)('article').filter('.excerpt')
        return [resolveBlog(excerpts.eq(i).html())
                for i in range(excerpts.length)]
    
    def crawler(url):
        article = crawlpage(url)
        for i in article:
            print i[0],i[1],i[2]
            html = getHtml(i[2])
            htmlname = i[2].split('/')[-1]
            d = pq(html)
            s = d('article').html()
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.html',s)
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.title',i[1]) 
            imgurl =  d('img').attr('src')
            #imgname = i[2].split('/')[-1]
            #print imgname
            ir = imgurl.encode("utf-8")
            #print ir
            urllib.urlretrieve(ir,'/etl/etldata/script/tmpdir/image/'+htmlname+'.jpg')
            #saveHtml('/etl/etldata/input/html/'+i[0]+'.html',html)
    
    url_base='http://www.36dsj.com/page/'
    for i in range(1,5):
        url = url_base+str(i)
        print url
        try :
           crawler(url)
        except:
           continue
    #url = 'http://www.36dsj.com/page/1'
    #crawler(url)

    Shell调用python爬取36大数据文章

    #!/bin/bash
    # Run the 36dsj crawler, then hand one downloaded article id (picked by
    # a random line number in the image listing) to mail.py.
    alias dt='date +%Y-%m-%d" "%H:%M:%S'
    shopt -s expand_aliases

    pypath=/usr/local/python2.7/bin
    dir=/etl/etldata/script

    # Number of images present before this run.
    a=$(ls "${dir}/tmpdir/image/" | wc -l)
    echo "ALL:${a}"

    # Random integer in 0..19; 0 is bumped to 1 because the value is used
    # as a 1-based line count below.
    num=$(awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}')
    if [ "${num}" -eq 0 ];then
       num=1
    fi
    echo "${num}"

    "${pypath}/python" "${dir}/tmpdir/crawler_36.py"

    # One id per line, oldest first.  Plain `ls -tr` is used instead of
    # `ls -ltr` so the listing has no "total N" header line that would be
    # mistaken for an id; the dot is escaped and the pattern anchored so
    # only a trailing ".jpg" extension is stripped.
    ls -tr "${dir}/tmpdir/image/" | sed 's/\.jpg$//' > "${dir}/tmp"

    # Take line ${num}, or the last line when the list is shorter.
    id=$(head -n "${num}" "${dir}/tmp" | awk '{print $1}' | tail -n 1)

    # Backticks kept here so the `dt` alias is expanded as before.
    echo  "`dt`:${id}"

    "${pypath}/python" "${dir}/tmpdir/mail.py" ${id}

     python爬取网页

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import requests
    import json
    import chardet
    
    # Python 2 only: reload the sys module, then set the interpreter-wide
    # default string encoding to UTF-8.
    # NOTE(review): presumably the reload is needed because
    # setdefaultencoding is not reachable on the already-imported sys
    # module — confirm.
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    
    
    def crawlpage(url="http://www.36dsj.com/"):
        """Fetch a listing page and parse every article excerpt on it.

        Returns a list with one resolveBlog() record for each ``article``
        element carrying the ``excerpt`` class on the page at *url*.
        The page body is decoded as UTF-8.  *url* defaults to the site
        home page, which is what this function previously always fetched
        regardless of its argument.
        """
        page = urllib2.urlopen(url)
        try:
            text = unicode(page.read(),"utf-8")
        finally:
            # Release the connection even if reading or decoding fails.
            page.close()
        # Run the selector once instead of re-querying on every iteration.
        excerpts = pq(text)('article').filter('.excerpt')
        return [resolveBlog(excerpts.eq(i).html())
                for i in range(excerpts.length)]
    
    #通过url获取网页
    def getHtml(url):
        """Fetch the page at *url* and return it as a unicode string.

        The response bytes are decoded as UTF-8.
        """
        body = urllib2.urlopen(urllib2.Request(url)).read()
        return unicode(body, 'utf-8')
        
    def get(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = response.read()
        s1 = sys.getfilesystemencoding()
        s2 = chardet.detect(html)['encoding'] 
        print s1,s2
        #print html
        return html.decode(s2).encode(s1)
    
    #保存网页内容
    def saveHtml(filepath,html):
        """Store *html* at *filepath*, replacing any existing file.

        Parent directories are created when missing.  A bare file name
        (no directory component) is written to the current working
        directory.
        """
        parent = os.path.split(filepath)[0]
        # A bare file name has an empty parent, and os.makedirs('') raises,
        # so directory creation is skipped in that case.
        if parent and not os.path.isdir(parent):
            os.makedirs(parent)
        if os.path.exists(filepath):
            os.remove(filepath)
        # Any old file was removed above, so plain write mode is equivalent
        # to the former append mode; `with` closes the handle on error too.
        with open(filepath, 'w') as out:
            out.write(html)
    
    #读网页文件
    def readHtml(filepath):
        """Read the file at *filepath* and return its UTF-8 decoded text."""
        # The context manager guarantees the handle is closed after reading.
        with open(filepath, 'rb') as f:
            data = f.read()
        return data.decode('utf-8')
        
    
    #解析网页
    def resolveHtml(filepath):
        """Load the HTML file at *filepath* into a PyQuery document."""
        return pq(filename=filepath)
        
    
    def resolveBlog(content):
        """Pull the fields of one article excerpt out of its HTML.

        Returns [id, title, href, author, date, category, note] for the
        excerpt markup given in *content*.
        """
        excerpt = pq(content)
        h2 = excerpt('h2')
        # Title of the article
        title = h2.text()
        # Link to the full article
        href = h2.find('a').attr('href')
        # The article ID is whatever follows the last '/' of the link
        blog_id = href.split('/')[-1]
        # Author, date and category are the first three
        # whitespace-separated tokens of the ".info" paragraph.
        tokens = excerpt('p').filter('.info').text().split()
        author = tokens[0]
        date = tokens[1]
        cat = tokens[2]
        # Summary text
        note = excerpt('p').filter('.note').text()
        return [blog_id, title, href, author, date, cat, note]
        
        
    def GetHtml(url):
        """Return the raw, undecoded body of the page at *url*."""
        return urllib.urlopen(url).read()
    
    
    
    def GetData(cont):
        """Format the first three cells of a table row as one text line.

        Returns the text of the first three ``td`` elements found in
        *cont*, separated by single spaces and terminated by a newline.
        """
        # Select the cells once rather than re-running the query per field.
        cells = pq(cont)('td')
        fields = [cells.eq(i).text() for i in range(3)]
        # The line terminator is the escape sequence '\n'; the original
        # literal had been broken across two source lines.
        return ' '.join(fields) + '\n'
    # Start URL.  Alternative targets that used to be assigned here and
    # were immediately overwritten (dead stores):
    #   http://www.36dsj.com/
    #   http://www.91333.com/pk10/
    #   http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563
    #   http://www.weather.com.cn/weather1d/101240303.shtml#input
    #   http://www.nmc.gov.cn/publish/forecast/AJX/wuyuan.html
    url = 'http://php.weather.sina.com.cn/search.php?city=%E6%C4%D4%B4&dpc=1'
    #url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'
    #url = 'http://hq.sinajs.cn/list=sh601006'
    
    def craw_pk10(date_id):
        url = 'http://baidu.lecai.com/lottery/draw/list/557?d='+date_id
        html =getHtml(url)
        d = pq(html)
        data=d('table').filter('#draw_list').find('tbody').find('tr')
        print data.length
        for i in range(data.length):
            s = GetData(data.eq(i))
            a = open('/home/hadoop/python/PK10_20170610.csv','a')
            a.write(s)
            a.close()
        #saveHtml('/home/hadoop/python/2017-06-10.csv',s)
  • 相关阅读:
    kettle Switch / Case Verification fails on working Transformation
    KETTLE 并行任务设置
    查看ORACLE各表空间使用情况
    KETTLE使用小记
    ORACLE IMP三种模式
    倒腾ORACLE小记
    KETTLE xls表格导入
    绕点旋转(老问题)
    TransformToolControl.as
    钻石恒久远,一颗永流传
  • 原文地址:https://www.cnblogs.com/Jims2016/p/7134753.html
Copyright © 2011-2022 走看看