  • Python crawler: scraping historical Shuangseqiu (double color ball) lottery draw results

     The crawlers I have written so far only handle static pages; for more advanced pages (for example, pages rendered by JS) my current skills cannot cope yet. I am still learning, so corrections are welcome.

    I do not really understand front-end development; whatever front-end knowledge appears here came from searching Baidu. Since crawlers mostly deal with front-end pages, front-end knowledge is well worth studying.

    This post stays with static pages but switches to different content, and it involves several Python modules plus a module I wrapped myself; in my view these modules are very useful even outside of crawling.

    Part one wraps the built-in logging module and uses the getpass module to record the current username in the log entries. It is all simple usage. As for comments: I had written them, but never pushed that version to git, so this upload has none; I will be more careful next time, ha ha.

    My suggestion: while learning Python, practice writing code a lot, and work out the usage as you write.

    #!/usr/bin/env python
    #coding:utf-8
    #author chenjisheng
    #date 20171129
    import logging
    import getpass
    
    
    class MyLog(object):
        "this class will create log"
        def __init__(self):
            user = getpass.getuser()          
            self.logger = logging.getLogger(user)
            self.logger.setLevel(logging.DEBUG)
            logFile = './progress.log'
            formatter = logging.Formatter(
                '%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s'
            )
            # write to both the screen and a file; the file keeps only ERROR and above
            logHand = logging.FileHandler(logFile)
            logHand.setFormatter(formatter)
            logHand.setLevel(logging.ERROR)
            logHandt = logging.StreamHandler()
            logHandt.setFormatter(formatter)
            self.logger.addHandler(logHand)
            self.logger.addHandler(logHandt)
    
        # five log levels, five convenience methods
        def debug(self,msg):
            self.logger.debug(msg)
    
        def info(self,msg):
            self.logger.info(msg)
    
        def warn(self,msg):
            self.logger.warning(msg)
    
        def error(self,msg):
            self.logger.error(msg)
    
        def critical(self,msg):
            self.logger.critical(msg)
    
    if __name__ == "__main__":
        mylog = MyLog()
        mylog.debug('i am debug')
        mylog.info('i am info')
        mylog.warn('i am warning')
        mylog.error('i am error')
        mylog.critical('i am critical')
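
     A quick way to see the handler levels in action (a minimal sketch; it assumes the class above is saved as myLog.py, the name the scraper below imports from): the StreamHandler has no level of its own, so everything the logger accepts (DEBUG and up) reaches the screen, while the FileHandler filters at ERROR, so only error and critical lines land in ./progress.log.

    #!/usr/bin/env python
    #coding:utf-8
    from myLog import MyLog

    log = MyLog()
    log.debug('console only')                 # below ERROR: screen, not the file
    log.info('console only')
    log.error('console and progress.log')     # ERROR and above: both outputs
    log.critical('console and progress.log')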
    

     Part two uses the re, urllib2, xlwt, bs4 and sys modules. The xlwt module was covered in an earlier post. bs4 is famous enough that it needs little introduction; I use it simply because it is easy, and I do not know the other scraping modules yet; I am still learning. A tiny BeautifulSoup warm-up follows before the full scraper.
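
    A minimal warm-up first (a sketch run on a made-up HTML snippet, not the real lottery page), showing the handful of bs4 calls the scraper below leans on: find / find_all and get_text.

    #!/usr/bin/env python
    #coding:utf-8
    from bs4 import BeautifulSoup

    # toy markup shaped like one row of the draw table
    html = '<table><tr><td>2017-11-28</td><td><em>01</em><em>12</em></td></tr></table>'
    soup = BeautifulSoup(html, 'lxml')
    row = soup.find('tr')                                # first matching tag
    print row.find_all('td')[0].get_text()               # -> 2017-11-28
    print [em.get_text() for em in row.find_all('em')]   # -> [u'01', u'12']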

    #!/usr/bin/env python
    #coding:utf-8
    """Created on 2017-11-29"""
    
    import re
    import urllib2
    import xlwt
    from bs4 import BeautifulSoup
    from myLog import MyLog as mylog
    import sys
    reload(sys)                      # Python 2 only: re-expose setdefaultencoding
    sys.setdefaultencoding('utf8')   # make utf-8 the default str encoding
    
    class DoubleColorBallItem(object):
        """Plain container for one draw record."""
        date = None
        order = None
        red1 = None
        red2 = None
        red3 = None
        red4 = None
        red5 = None
        red6 = None
        bule = None
        money = None
        firstPrize = None
        secondPrize = None
    
    class GetDoubleColorBallNumber(object):
        """capture BallNumbers"""
    
        def __init__(self):
            self.urls = []
            self.log = mylog()
            self.getUrls()
            self.items = self.spider(self.urls)
            self.pipelines(self.items)
    
    
        def getUrls(self):
            URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
            htmlContent = self.getResponseContent(URL)
            soup = BeautifulSoup(htmlContent,'lxml')
            tag = soup.find_all(re.compile('p'))[-1]
            # pages = tag.strong.get_text()  # real page count parsed from the pager
            pages = 2                        # hard-coded to two pages while testing
            for i in xrange(1, int(pages) + 1):
                url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
                self.urls.append(url)
                self.log.info(u'append URL:%s\n' % url)
    
        def getResponseContent(self,url):
            try:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                }
                req = urllib2.Request(url,headers=headers)
                response = urllib2.urlopen(req)
            except Exception, e:
                self.log.error(u'return datas failed URL:%s\n' % url)
            else:
                self.log.info(u'return datas successfully URL:%s\n' % url)
                return response.read()
    
        def spider(self,urls):
            items = []
            for url in urls:
                htmlContent = self.getResponseContent(url)
                soup = BeautifulSoup(htmlContent,'lxml')
                tags = soup.find_all('tr',attrs={})
                for tag in tags:
                    if tag.find('em'):
                        item = DoubleColorBallItem()
                        tagTd = tag.find_all('td')
                        item.date = tagTd[0].get_text()
                        item.order = tagTd[1].get_text()
                        tagEm = tagTd[2].find_all('em')
                        item.red1 = tagEm[0].get_text()
                        item.red2 = tagEm[1].get_text()
                        item.red3 = tagEm[2].get_text()
                        item.red4 = tagEm[3].get_text()
                        item.red5 = tagEm[4].get_text()
                        item.red6 = tagEm[5].get_text()
                        item.bule = tagEm[6].get_text()
                        item.money = tagTd[3].find('strong').get_text()
                        item.firstPrize = tagTd[4].find('strong').get_text()
                        item.secondPrize = tagTd[5].find('strong').get_text()
                        items.append(item)
                        self.log.info(u'get date:%s datas OK\n' % item.date)
            return items
    
        def pipelines(self, items):
            # earlier text-file version, kept for reference:
            # fileName = 'DoubleBall.txt'
            # with open(fileName, 'w') as fp:
            #     for item in items:
            #         fp.write('%s %s \t %s %s %s %s %s %s \t %s \t %s %s\n'
            #                  % (item.date, item.order, item.red1, item.red2,
            #                     item.red3, item.red4, item.red5, item.red6,
            #                     item.bule, item.firstPrize, item.secondPrize))
            #         self.log.info(u'write date:%s OK' % item.date)
            W = xlwt.Workbook('utf-8')
            ws = W.add_sheet(u"双色球记录")
            # ws.col(1).width = 6666
            # ws.col(2).width = 3333
            ws.write(0,1,label=u"时间")
            ws.write(0,2,label=u"期号")
            ws.write(0,3, label=u"红色1")
            ws.write(0,4, label=u"红色2")
            ws.write(0,5, label=u"红色3")
            ws.write(0,6, label=u"红色4")
            ws.write(0,7, label=u"红色5")
            ws.write(0,8, label=u"红色6")
            ws.write(0,9, label=u"蓝色")
            ws.write(0,10, label=u"一等奖")
            ws.write(0,11, label=u"二等奖")
            nu = 1  # data rows start at row 1; row 0 holds the column headers
            for item in items:
                ws.write(nu,1,label=item.date)
                ws.write(nu,2,label=item.order)
                ws.write(nu,3,label=item.red1)
                ws.write(nu,4,label=item.red2)
                ws.write(nu,5,label=item.red3)
                ws.write(nu,6,label=item.red4)
                ws.write(nu,7,label=item.red5)
                ws.write(nu,8,label=item.red6)
                ws.write(nu,9,label=item.bule)
                ws.write(nu,10,label=item.firstPrize)
                ws.write(nu,11,label=item.secondPrize)
                nu += 1
            W.save(u"双色球记录表.xls")

    if __name__ == '__main__':
        GDCBN = GetDoubleColorBallNumber()
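
     One obvious next step: getUrls hard-codes pages = 2 for testing, while the commented-out pages = tag.strong.get_text() line hints at reading the real page count from the pager. A more defensive variant (a sketch; it assumes the pager paragraph exposes the total page count as its first run of digits, which may not match the live markup) could look like this:

    import re
    from bs4 import BeautifulSoup

    def parse_page_count(htmlContent, default=2):
        """Pull the total page count out of the pager text; fall back to default."""
        soup = BeautifulSoup(htmlContent, 'lxml')
        tag = soup.find_all(re.compile('p'))[-1]   # same pager tag the scraper grabs
        match = re.search(r'\d+', tag.get_text())
        return int(match.group()) if match else default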
    

     The code above also draws on other people's experience, and I learned a good deal from it; I hope this post brings you some inspiration as well.

  • Original post: https://www.cnblogs.com/Mail-maomao/p/7955389.html