zoukankan      html  css  js  c++  java
  • Python爬虫

    python爬取36大数据文件代码

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import httplib
    # Force every httplib connection to identify itself as HTTP/1.0.
    # NOTE(review): presumably a workaround for servers whose HTTP/1.1
    # responses could not be read completely -- confirm before removing.
    httplib.HTTPConnection._http_vsn = 10
    httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    
    
    # Python 2 only: make UTF-8 the interpreter-wide default for implicit
    # str <-> unicode conversions. reload(sys) is required because
    # sys.setdefaultencoding is removed from the module during startup.
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    #获取网页
    def getHtml(url):
        """Download ``url`` and return the response body decoded as UTF-8.

        Errors raised by urllib2 or by the UTF-8 decoding propagate to the
        caller unchanged.
        """
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            raw = response.read()
        finally:
            # Release the connection even when the read fails part-way.
            response.close()
        return raw.decode('utf-8')
    
    #保存网页
    def saveHtml(filepath,html):
        """Write ``html`` to ``filepath``, replacing any existing file.

        Missing parent directories are created first. A ``filepath`` without
        a directory part is written into the current working directory.
        """
        file_dir = os.path.dirname(filepath)
        # dirname is '' for a bare file name, and os.makedirs('') would raise.
        if file_dir and not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        # Mode 'w' truncates an existing file, so no separate delete is
        # needed; the with-block closes the handle even if write() fails.
        with open(filepath, 'w') as out:
            out.write(html)
    
    #读文件
    def readHtml(filepath):
        """Return the content of ``filepath`` decoded from UTF-8."""
        # The with-block closes the handle, which the original never did.
        with open(filepath, 'rb') as f:
            return f.read().decode('utf-8')
        
    
    #解析网页
    def resolveHtml(filepath):
        """Load the HTML file at ``filepath`` into a PyQuery document and return it."""
        return pq(filename=filepath)
        
    
    def resolveBlog(content):
        """Extract the fields of one article excerpt.

        ``content`` is the HTML of a single excerpt. Returns the list
        ``[id, title, href, author, date, cat, note]`` where ``id`` is the
        last path segment of ``href`` and author/date/cat are the first three
        whitespace-separated tokens of the ``p.info`` text. When the heading
        has no link, ``id`` is '' and ``href`` stays ``None``; missing
        author/date/cat tokens are returned as ''.
        """
        d_cont = pq(content)
        heading = d_cont('h2')
        # Article title
        title = heading.text()
        # Article link
        href = heading.find('a').attr('href')
        # Article id: guard against a heading without a link instead of
        # failing on None.split().
        article_id = href.split('/')[-1] if href else ''
        # Author, date, category: split once and pad so a short info line
        # does not raise IndexError.
        info = (d_cont('p').filter('.info').text() or '').split()
        author, date, cat = (info + ['', '', ''])[:3]
        # Summary text
        note = d_cont('p').filter('.note').text()
        return [article_id, title, href, author, date, cat, note]
    
    # Effective crawl target. The values below were assigned and then
    # immediately overwritten, so they are kept only as a note:
    #   http://www.91333.com/pk10/
    #   http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563
    url = 'http://www.36dsj.com/'
    
    #html = getHtml(url)
    
    def crawlpage(url):
        """Fetch the listing page at ``url`` and parse every ``article.excerpt``.

        Returns a list with one ``resolveBlog`` result per excerpt, in page
        order.
        """
        page = urllib2.urlopen(url)
        try:
            text = page.read().decode("utf-8")
        finally:
            # Close the response even if reading or decoding fails.
            page.close()
        # Run the selector once instead of re-querying on every iteration.
        excerpts = pq(text)('article').filter('.excerpt')
        return [resolveBlog(excerpts.eq(i).html())
                for i in range(excerpts.length)]
    
    def crawler(url):
        article = crawlpage(url)
        for i in article:
            print i[0],i[1],i[2]
            html = getHtml(i[2])
            htmlname = i[2].split('/')[-1]
            d = pq(html)
            s = d('article').html()
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.html',s)
            saveHtml('/etl/etldata/script/tmpdir/html/'+htmlname+'.title',i[1]) 
            imgurl =  d('img').attr('src')
            #imgname = i[2].split('/')[-1]
            #print imgname
            ir = imgurl.encode("utf-8")
            #print ir
            urllib.urlretrieve(ir,'/etl/etldata/script/tmpdir/image/'+htmlname+'.jpg')
            #saveHtml('/etl/etldata/input/html/'+i[0]+'.html',html)
    
    url_base='http://www.36dsj.com/page/'
    for i in range(1,5):
        url = url_base+str(i)
        print url
        try :
           crawler(url)
        except:
           continue
    #url = 'http://www.36dsj.com/page/1'
    #crawler(url)

    Shell调用python爬取36大数据文章

    #!/bin/bash
    # Run the 36dsj crawler, pick one downloaded article by a random
    # position in the image list, and pass its id to mail.py.

    # Timestamp helper for log lines.
    dt() { date '+%Y-%m-%d %H:%M:%S'; }

    pypath=/usr/local/python2.7/bin
    dir=/etl/etldata/script

    # Number of images already present before this run.
    a=$(ls "${dir}/tmpdir/image/" | wc -l)
    echo "ALL:${a}"

    # Random integer in 0..19; 0 is bumped to 1 because it is used as a
    # 1-based line count below.
    num=$(awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}')
    if [ "${num}" -eq 0 ];then
       num=1
    fi
    echo "${num}"

    "$pypath/python" "${dir}/tmpdir/crawler_36.py"

    # One image name per line, oldest first, with the .jpg suffix removed.
    # Plain "ls -tr" is used so the "total" line of "ls -l" cannot end up
    # in the list.
    ls -tr "${dir}/tmpdir/image/" | sed 's/\.jpg$//' > "${dir}/tmp"

    # Line ${num} of the list, or the last line when the list is shorter.
    id=$(head -n "${num}" "${dir}/tmp" | awk '{print $1}' | tail -n 1)

    echo "$(dt):${id}"

    "$pypath/python" "${dir}/tmpdir/mail.py" ${id}

     python爬取网页

    # -*- coding:UTF-8 -*-
    import urllib2
    import re
    import os
    import time
    from sgmllib import SGMLParser
    from pyquery import PyQuery as pq
    from lxml import etree
    import urllib
    import sys 
    import requests
    import json
    import chardet
    
    # Python 2 only: make UTF-8 the interpreter-wide default for implicit
    # str <-> unicode conversions. reload(sys) is required because
    # sys.setdefaultencoding is removed from the module during startup.
    reload(sys)
    sys.setdefaultencoding( "utf-8" )
    
    
    
    def crawlpage(url):
        """Fetch the listing page at ``url`` and parse every ``article.excerpt``.

        Returns a list with one ``resolveBlog`` result per excerpt, in page
        order.
        """
        # Fetch the requested page; the hard-coded site root used before
        # silently ignored the ``url`` argument.
        page = urllib2.urlopen(url)
        try:
            text = page.read().decode("utf-8")
        finally:
            # Close the response even if reading or decoding fails.
            page.close()
        # Run the selector once instead of re-querying on every iteration.
        excerpts = pq(text)('article').filter('.excerpt')
        return [resolveBlog(excerpts.eq(i).html())
                for i in range(excerpts.length)]
    
    #通过url获取网页
    def getHtml(url):
        """Download ``url`` and return the response body decoded as UTF-8.

        Errors raised by urllib2 or by the UTF-8 decoding propagate to the
        caller unchanged.
        """
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            raw = response.read()
        finally:
            # Release the connection even when the read fails part-way.
            response.close()
        return raw.decode('utf-8')
        
    def get(url):
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        html = response.read()
        s1 = sys.getfilesystemencoding()
        s2 = chardet.detect(html)['encoding'] 
        print s1,s2
        #print html
        return html.decode(s2).encode(s1)
    
    #保存网页内容
    def saveHtml(filepath,html):
        """Write ``html`` to ``filepath``, replacing any existing file.

        Missing parent directories are created first. A ``filepath`` without
        a directory part is written into the current working directory.
        """
        file_dir = os.path.dirname(filepath)
        # dirname is '' for a bare file name, and os.makedirs('') would raise.
        if file_dir and not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        # Delete-then-append was just a truncating write; mode 'w' does that
        # directly and the with-block closes the handle even if write() fails.
        with open(filepath, 'w') as out:
            out.write(html)
    
    #读网页文件
    def readHtml(filepath):
        """Return the content of ``filepath`` decoded from UTF-8."""
        # The with-block closes the handle, which the original never did.
        with open(filepath, 'rb') as f:
            return f.read().decode('utf-8')
        
    
    #解析网页
    def resolveHtml(filepath):
        """Load the HTML file at ``filepath`` into a PyQuery document and return it."""
        return pq(filename=filepath)
        
    
    def resolveBlog(content):
        """Extract the fields of one article excerpt.

        ``content`` is the HTML of a single excerpt. Returns the list
        ``[id, title, href, author, date, cat, note]`` where ``id`` is the
        last path segment of ``href`` and author/date/cat are the first three
        whitespace-separated tokens of the ``p.info`` text. When the heading
        has no link, ``id`` is '' and ``href`` stays ``None``; missing
        author/date/cat tokens are returned as ''.
        """
        d_cont = pq(content)
        heading = d_cont('h2')
        # Article title
        title = heading.text()
        # Article link
        href = heading.find('a').attr('href')
        # Article id: guard against a heading without a link instead of
        # failing on None.split().
        article_id = href.split('/')[-1] if href else ''
        # Author, date, category: split once and pad so a short info line
        # does not raise IndexError.
        info = (d_cont('p').filter('.info').text() or '').split()
        author, date, cat = (info + ['', '', ''])[:3]
        # Summary text
        note = d_cont('p').filter('.note').text()
        return [article_id, title, href, author, date, cat, note]
        
        
    def GetHtml(url):
        """Return the raw, undecoded body fetched from ``url``."""
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            # Close the response whether or not the read succeeded.
            page.close()
    
    
    
    def GetData(cont):
        """Format the first three ``td`` cells of ``cont`` as one text line.

        Returns the three cell texts separated by single spaces and
        terminated by a newline.
        """
        # Query the cells once; the original comment labels the first one
        # as the date.
        cells = pq(cont)('td')
        # The newline escape was split across two source lines, which made
        # the string literal unterminated.
        return ' '.join(cells.eq(i).text() for i in range(3)) + '\n'
    # Effective target URL. The values below were assigned and then
    # immediately overwritten, so they are kept only as a note:
    #   http://www.36dsj.com/
    #   http://www.91333.com/pk10/
    #   http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563
    #   http://www.weather.com.cn/weather1d/101240303.shtml#input
    #   http://www.nmc.gov.cn/publish/forecast/AJX/wuyuan.html
    url = 'http://php.weather.sina.com.cn/search.php?city=%E6%C4%D4%B4&dpc=1'
    #url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'
    #url = 'http://hq.sinajs.cn/list=sh601006'
    
    def craw_pk10(date_id):
        url = 'http://baidu.lecai.com/lottery/draw/list/557?d='+date_id
        html =getHtml(url)
        d = pq(html)
        data=d('table').filter('#draw_list').find('tbody').find('tr')
        print data.length
        for i in range(data.length):
            s = GetData(data.eq(i))
            a = open('/home/hadoop/python/PK10_20170610.csv','a')
            a.write(s)
            a.close()
        #saveHtml('/home/hadoop/python/2017-06-10.csv',s)
  • 相关阅读:
    服务器上的 Git
    进程有哪几种基本状态,在一个系统中为什么必须区分出这几种状态?
    什么是进程,进程与程序的主要区别是什么?
    什么是与时间有关的错误,试举例说明。
    试解释下列名词:程序的顺序执行,程序的并发执行。
    简述系统调用的执行过程?
    假定某系统提供硬件的访管指令(例如形式:“svc n”),为了实现系统调用,系统设计者应做哪些工作?用户又如何请求操作系统服务?
    什么是系统调用,对操作系统的服务请求与一般子程序调用有什么区别?
    Windows系统提供什么样的接口,Unix、Linux系统的用户接口是什么?
    用户与操作系统的接口是什么?一个分时系统提供什么接口?一个批处理系统又提供什么接口?
  • 原文地址:https://www.cnblogs.com/Jims2016/p/7134753.html
Copyright © 2011-2022 走看看