  • Python crawler

    Python code to crawl articles from 36大数据 (36dsj.com)

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import httplib
    httplib.HTTPConnection._http_vsn = 10
    httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    
    
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    # fetch a page by URL
    def getHtml(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = unicode(response.read(),'utf-8')
        return html
    
    # save a page to disk, creating the target directory if needed
    def saveHtml(filepath,html):
        file_dir = os.path.split(filepath)[0]
        if not os.path.isdir(file_dir):
           os.makedirs(file_dir)
        if os.path.exists(filepath):
           os.remove(filepath)
        file = open(filepath,'w')
        file.write(html)
        file.close()
    
    # read a saved file
    def readHtml(filepath):
         f  = open(filepath,'rb')
         return unicode(f.read(),'utf-8')
        
    
    # parse a saved page with pyquery
    def resolveHtml(filepath):
        d = pq(filename=filepath)
        return d
        
    
    def resolveBlog(content):
        d_cont = pq(content)
        # article title
        title = d_cont('h2').text()
        # article link (href)
        href = d_cont('h2').find('a').attr('href')
        # article ID (last segment of the href)
        id = href.split('/')[-1]
        # author, date, category
        au_tm_cat = d_cont('p').filter('.info').text()
        author = au_tm_cat.split()[0]
        date = au_tm_cat.split()[1]
        cat = au_tm_cat.split()[2]
        # summary text
        note = d_cont('p').filter('.note').text()
        blog = [id,title,href,author,date,cat,note]
        return blog
    
    # test URLs left over from development; the crawl loop below uses url_base instead
    url = 'http://www.36dsj.com/'
    url = 'http://www.91333.com/pk10/'
    url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
    url = 'http://www.36dsj.com/'
    
    #html = getHtml(url)
    
    def crawlpage(url):
        page = urllib2.urlopen(url)
        text = unicode(page.read(),"utf-8")
        d = pq(text)
        bloglist = []
        for i in range(d('article').filter('.excerpt').length):
            content = d('article').filter('.excerpt').eq(i).html()
            a = resolveBlog(content)
            #print a[5]
            bloglist.append(a)
        return bloglist
    
    def crawler(url):
        article = crawlpage(url)
        for i in article:
            print i[0],i[1],i[2]
            html = getHtml(i[2])
            htmlname = i[2].split('/')[-1]
            d = pq(html)
            s = d('article').html()
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.html',s)
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.title',i[1]) 
            imgurl =  d('img').attr('src')
            #imgname = i[2].split('/')[-1]
            #print imgname
            ir = imgurl.encode("utf-8")
            #print ir
            urllib.urlretrieve(ir,'/etl/etldata/script/tmpdir/image/'+htmlname+'.jpg')
            #saveHtml('/etl/etldata/input/html/'+i[0]+'.html',html)
    
    url_base='http://www.36dsj.com/page/'
    for i in range(1,5):
        url = url_base+str(i)
        print url
        try :
           crawler(url)
        except:
           continue
    #url = 'http://www.36dsj.com/page/1'
    #crawler(url)
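
    The script above is written for Python 2 (urllib2, sgmllib, httplib, reload(sys)) and will not run unmodified on Python 3. As a minimal sketch, assuming Python 3 with the requests and pyquery packages installed, the same listing-page crawl could look like the following; the selectors mirror the ones used above, while the function names and the driver at the bottom are illustrative, not part of the original script.

    # -*- coding: utf-8 -*-
    # Minimal Python 3 sketch of the listing-page crawl above (assumes requests + pyquery).
    import os
    import requests
    from pyquery import PyQuery as pq

    def crawl_page(url):
        """Return [id, title, href, note] for each article teaser on a listing page."""
        text = requests.get(url, timeout=10).text
        d = pq(text)
        blogs = []
        for item in d('article.excerpt').items():
            href = item.find('h2').find('a').attr('href') or ''
            blogs.append([href.rstrip('/').split('/')[-1],   # article ID
                          item.find('h2').text(),            # title
                          href,                               # link
                          item.find('p.note').text()])        # summary
        return blogs

    def save_html(filepath, html):
        # create the target directory if needed, then (over)write the file
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html)

    if __name__ == '__main__':
        for blog in crawl_page('http://www.36dsj.com/page/1'):
            print(blog[0], blog[1], blog[2])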

    Shell wrapper that calls the Python crawler to fetch 36大数据 articles

    #!/bin/bash
    alias dt='date +%Y-%m-%d" "%H:%M:%S'
    shopt -s expand_aliases
    
    
    pypath=/usr/local/python2.7/bin
    
    dir=/etl/etldata/script
    
    a=`echo $RANDOM`
    
    a=`ls  ${dir}/tmpdir/image/ | wc -l`
    
    echo "ALL:${a}"
    
    num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
    
    if [ ${num} -eq 0 ];then
       num=1
    fi
    
    #num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
    echo ${num}
    
    #num=1
    
    
    #dir=/etl/etldata/script
    
    
    $pypath/python ${dir}/tmpdir/crawler_36.py
    
    # list downloaded images (oldest first) without their .jpg extension
    ls -tr ${dir}/tmpdir/image/ | sed 's/\.jpg$//' > ${dir}/tmp
    
    #id=`cat ${dir}/tmp | sed -n "${num}p" | awk '{print $1}'`
    
    id=`cat ${dir}/tmp | head -${num} | awk '{print $1}' | tail -1`
      
    echo  "`dt`:${id}"
      
    $pypath/python ${dir}/tmpdir/mail.py ${id}
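
    The ls/head/tail pipeline above only serves to pick one article id at random from the downloaded images and hand it to mail.py. Purely as an illustration (the image directory is taken from the script, everything else is assumed), the same selection in Python would be:

    # Python equivalent of the random-id selection done by the shell pipeline above.
    # Note: the shell version limits itself to the first ~20 entries; this picks from all of them.
    import os
    import random

    image_dir = '/etl/etldata/script/tmpdir/image/'
    ids = [name[:-len('.jpg')] for name in os.listdir(image_dir) if name.endswith('.jpg')]
    article_id = random.choice(ids)
    print(article_id)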

    Python code for crawling web pages

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import requests
    import json
    import chardet
    
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    
    
    def crawlpage(url):
        page = urllib2.urlopen(url)
        text = unicode(page.read(),"utf-8")
        d = pq(text)
        bloglist = []
        for i in range(d('article').filter('.excerpt').length):
            content = d('article').filter('.excerpt').eq(i).html()
            a = resolveBlog(content)
            #print a[5]
            bloglist.append(a)
        return bloglist
    
    # fetch a page by URL
    def getHtml(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = unicode(response.read(),'utf-8')
        return html
        
    def get(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = response.read()
        s1 = sys.getfilesystemencoding()
        s2 = chardet.detect(html)['encoding'] 
        print s1,s2
        #print html
        return html.decode(s2).encode(s1)
    
    # save page content to a file
    def saveHtml(filepath,html):
        file_dir = os.path.split(filepath)[0]
        if not os.path.isdir(file_dir):
           os.makedirs(file_dir)
        if os.path.exists(filepath):
           os.remove(filepath)
        file = open(filepath,'a')
        file.write(html)
        file.close()
    
    # read a saved page file
    def readHtml(filepath):
         f  = open(filepath,'rb')
         return unicode(f.read(),'utf-8')
        
    
    # parse a saved page with pyquery
    def resolveHtml(filepath):
        d = pq(filename=filepath)
        return d
        
    
    def resolveBlog(content):
        d_cont = pq(content)
        # article title
        title = d_cont('h2').text()
        # article link (href)
        href = d_cont('h2').find('a').attr('href')
        # article ID (last segment of the href)
        id = href.split('/')[-1]
        # author, date, category
        au_tm_cat = d_cont('p').filter('.info').text()
        author = au_tm_cat.split()[0]
        date = au_tm_cat.split()[1]
        cat = au_tm_cat.split()[2]
        # summary text
        note = d_cont('p').filter('.note').text()
        blog = [id,title,href,author,date,cat,note]
        return blog
        
        
    def GetHtml(url):  
        page = urllib.urlopen(url)  
        contex = page.read()  
        return contex  
    
    
    
    def GetData(cont):
        data = pq(cont)
        # date
        date_id = data('td').eq(0).text()
        code_id = data('td').eq(1).text()
        A = data('td').eq(2).text()
        s = date_id+' '+code_id+' '+A+'\n'
        return s
    # test URLs left over from development; craw_pk10 below builds its own URL
    url = 'http://www.36dsj.com/'
    url = 'http://www.91333.com/pk10/'
    url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
    url = 'http://www.36dsj.com/'
    url = 'http://www.weather.com.cn/weather1d/101240303.shtml#input'
    url = 'http://www.nmc.gov.cn/publish/forecast/AJX/wuyuan.html'
    url = 'http://php.weather.sina.com.cn/search.php?city=%E6%C4%D4%B4&dpc=1'
    #url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'
    #url = 'http://hq.sinajs.cn/list=sh601006'
    
    def craw_pk10(date_id):
        url = 'http://baidu.lecai.com/lottery/draw/list/557?d='+date_id
        html =getHtml(url)
        d = pq(html)
        data=d('table').filter('#draw_list').find('tbody').find('tr')
        print data.length
        for i in range(data.length):
            s = GetData(data.eq(i))
            a = open('/home/hadoop/python/PK10_20170610.csv','a')
            a.write(s)
            a.close()
        #saveHtml('/home/hadoop/python/2017-06-10.csv',s)
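
    craw_pk10 is defined above but never called in this script. A minimal driver, assuming the draw-list URL expects dates in YYYY-MM-DD form (suggested by the commented-out 2017-06-10.csv filename), could look like this; the date range is only an example.

    # Illustrative driver loop for craw_pk10; the dates are examples, not from the post.
    import datetime

    start = datetime.date(2017, 6, 1)
    for n in range(10):
        date_id = (start + datetime.timedelta(days=n)).strftime('%Y-%m-%d')
        craw_pk10(date_id)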
  • Original post: https://www.cnblogs.com/Jims2016/p/7134753.html