zoukankan      html  css  js  c++  java
  • 2017.08.17 Python爬虫实战之BS4获取双色球中奖信息

    1.目标分析:

    (1)网址:http://www.zhcw.com/ssq/kaijiangshuju/index.shtml?type=0

    (2)查看框架的源代码:在中奖信息表格上右击,选择弹出菜单中的“查看框架的源代码”:

    (3)右击下一页的链接,再次查看框架源代码,新的框架源代码是:kaijiang.zhcw.com/zhcw/html/ssq/list_2.html

     

    (4)至此大致明白了URL的变化规律:变化的只有list_N.html中的页码N(例如list_1.html、list_2.html);表格中每一行的数据都包含在一对<tr>标签内,所以在写爬虫时只需先将<tr>标签挑选出来,然后再到其中过滤数据就可以了;

    2.项目实施

    (1)项目文件结构:

    (2)getWinningNum.py文件内容:

    #! /usr/bin/env python
    #-*- coding:utf-8 -*-

    import re
    from bs4 import BeautifulSoup
    import urllib2
    from mylog import MyLog as mylog

    class DoubleColorBallItem(object):
        """Plain record holding the fields of one Double Color Ball draw.

        Every field defaults to ``None`` at class level; the spider assigns
        the text scraped from the result table onto each instance.
        """

        date = None         # draw date
        order = None        # sequence number of the draw within its year
        red1 = None         # first red ball
        red2 = None         # second red ball (was misspelled ``red12``)
        red3 = None         # third red ball
        red4 = None         # fourth red ball
        red5 = None         # fifth red ball
        red6 = None         # sixth red ball
        blue = None         # blue ball
        money = None        # prize pool amount
        firstPrize = None   # number of first-prize winners
        secondPrize = None  # number of second-prize winners


    class GetDoubleColorBallNumber(object):
        """Scrape the paginated result tables and dump every draw to a text file.

        Instantiating the class runs the whole job: collect the page URLs,
        parse each page into ``DoubleColorBallItem`` records, then write the
        records out through ``pipelines``.
        """

        def __init__(self):
            self.urls = []
            self.log = mylog()
            self.getUrls()
            self.items = self.spider(self.urls)
            self.pipelines(self.items)

        def getUrls(self):
            """Fill ``self.urls`` with one URL per result page.

            The page count is read from the ``<strong>`` child of the last
            tag on the first page whose name matches the regex ``'p'``.
            If the first page cannot be downloaded, ``self.urls`` stays empty.
            """
            URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
            htmlContent = self.getResponseContent(URL)
            if htmlContent is None:
                # The failure is already logged; without the first page the
                # number of pages is unknown, so there is nothing to enqueue.
                return
            soup = BeautifulSoup(htmlContent, 'lxml')
            # NOTE(review): the regex matches any tag name containing "p";
            # presumably the last such tag is the pager paragraph - confirm.
            tag = soup.find_all(re.compile('p'))[-1]
            pages = tag.strong.get_text()
            for i in xrange(1, int(pages) + 1):
                url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
                self.urls.append(url)
                self.log.info(u'添加URL:%s到URLS ' % url)

        def getResponseContent(self, url):
            """Download ``url`` and return the raw body, or ``None`` on failure.

            Failures are logged rather than raised so that one bad page does
            not abort the whole crawl.
            """
            try:
                response = urllib2.urlopen(url.encode('utf8'))
                try:
                    content = response.read()
                finally:
                    # Always release the connection, even if read() fails.
                    response.close()
            except Exception:
                # Deliberately broad best-effort handling, but unlike a bare
                # ``except`` it lets KeyboardInterrupt/SystemExit through.
                self.log.error(u'python 返回 URL:%s 数据失败' % url)
                return None
            self.log.info(u'Python 返回URL:%s A数据成功' % url)
            return content

        def spider(self, urls):
            """Parse every page in ``urls`` and return a list of draw records.

            Only ``<tr>`` rows containing an ``<em>`` element are treated as
            draws. Pages that failed to download are skipped.
            """
            items = []
            for url in urls:
                htmlContent = self.getResponseContent(url)
                if htmlContent is None:
                    continue
                soup = BeautifulSoup(htmlContent, 'lxml')
                for tag in soup.find_all('tr', attrs={}):
                    if not tag.find('em'):
                        continue
                    item = DoubleColorBallItem()
                    tagTd = tag.find_all('td')
                    item.date = tagTd[0].get_text()
                    item.order = tagTd[1].get_text()
                    # The third cell holds seven <em> elements: six red, one blue.
                    tagEm = tagTd[2].find_all('em')
                    item.red1 = tagEm[0].get_text()
                    item.red2 = tagEm[1].get_text()
                    item.red3 = tagEm[2].get_text()
                    item.red4 = tagEm[3].get_text()
                    item.red5 = tagEm[4].get_text()
                    item.red6 = tagEm[5].get_text()
                    item.blue = tagEm[6].get_text()
                    item.money = tagTd[3].find('strong').get_text()
                    item.firstPrize = tagTd[4].find('strong').get_text()
                    item.secondPrize = tagTd[5].find('strong').get_text()

                    items.append(item)
                    self.log.info(u'获取日期为:%s 的数据成功' % (item.date))
            return items

        def pipelines(self, items):
            """Write ``items`` to the output text file, one draw per line."""
            # NOTE(review): the GBK-encoded byte-string file name presumably
            # targets Python 2 on a Chinese-locale Windows console - confirm.
            fileName = u'双色球.txt'.encode('GBK')
            with open(fileName, 'w') as fp:
                for item in items:
                    # Terminate each record with a newline; without it every
                    # draw ran together on a single line.
                    fp.write('%s %s %s %s %s %s %s %s %s %s %s %s\n'
                             % (item.date, item.order, item.red1, item.red2,
                                item.red3, item.red4, item.red5, item.red6,
                                item.blue, item.money, item.firstPrize,
                                item.secondPrize))
                    # Logged per record inside the loop, so an empty ``items``
                    # list never touches an undefined ``item``.
                    self.log.info(u'将日期为:%s 的数据存入"%s"...' % (item.date, fileName.decode('GBK')))


    # Script entry point: constructing the scraper runs the whole crawl
    # (URL discovery, parsing and saving all happen in __init__).
    if __name__ == '__main__':
        GDCBN = GetDoubleColorBallNumber()

    (3)mylog.py文件的内容:
    #! /usr/bin/env python
    #-*- coding:utf-8 -*-

    import logging
    import getpass
    import sys

    class MyLog(object):
        """Small wrapper around ``logging`` that logs to a file and the console.

        The logger is named after the current user. The log file sits next to
        the running script and shares its name, with a ``.log`` extension.
        """

        def __init__(self):
            # Local import keeps this block self-contained; the module's
            # import section does not bring ``os`` into scope.
            import os

            self.user = getpass.getuser()
            self.logger = logging.getLogger(self.user)
            self.logger.setLevel(logging.DEBUG)

            # Log file name: script path with its extension replaced by
            # ".log". splitext copes with scripts whose suffix is not exactly
            # three characters long, unlike slicing off the last 3 characters.
            self.logFile = os.path.splitext(sys.argv[0])[0] + '.log'
            self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s ')

            # Send every record both to the log file and to the console.
            self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
            self.logHand.setFormatter(self.formatter)
            self.logHand.setLevel(logging.DEBUG)

            self.logHandSt = logging.StreamHandler()
            self.logHandSt.setFormatter(self.formatter)
            self.logHandSt.setLevel(logging.DEBUG)

            # NOTE(review): getLogger returns the same logger for the same
            # user, so creating several MyLog objects attaches duplicate
            # handlers and repeats every message.
            self.logger.addHandler(self.logHand)
            self.logger.addHandler(self.logHandSt)

        # One thin method per logging level.
        def debug(self, msg):
            """Log ``msg`` at DEBUG level."""
            self.logger.debug(msg)

        def info(self, msg):
            """Log ``msg`` at INFO level."""
            self.logger.info(msg)

        def warn(self, msg):
            """Log ``msg`` at WARNING level."""
            # Logger.warn is a deprecated alias; warning() is the supported name.
            self.logger.warning(msg)

        def error(self, msg):
            """Log ``msg`` at ERROR level."""
            self.logger.error(msg)

        def critical(self, msg):
            """Log ``msg`` at CRITICAL level."""
            self.logger.critical(msg)


    # Manual smoke test: emit one message at each level, including
    # non-ASCII text, to check the file and console handlers.
    if __name__ == '__main__':
        mylog = MyLog()
        mylog.debug(u"I'm debug 测试中文")
        mylog.info("I'm info")
        mylog.warn("I'm warn")
        mylog.error(u"I'm error 测试中文 ")
        mylog.critical("I'm critical")


    (4)运行结果:

    3.结果保存到Excel中:(1)先安装第三方模块xlwt;与之配套的xlrd模块负责从Excel中读取数据,xlwt则负责将数据写入到Excel中;

    (2)先写一个简单的python程序测试一下,excelWrite.py:

    #! /usr/bin/env python
    #-*- coding:utf-8 -*-


    import xlwt

    # Minimal xlwt check: write one ASCII cell and one UTF-8 encoded
    # Chinese cell, then save the workbook.
    if __name__ == '__main__':
        book = xlwt.Workbook(encoding='utf8', style_compression=0)
        sheet = book.add_sheet('dede')
        sheet.write(0, 0, 'hstking')
        sheet.write(1, 1, u'中文测试'.encode('utf8'))
        # Raw string: in a normal literal the "\1" of this Windows path is an
        # octal escape for chr(1), which corrupts the file name.
        book.save(r'F:\PythonPythonWebScraping\PythonBeautiSoupProject\winningNumBS4\getWinningNum\1.xls')

    在编辑器pycharm中打开1.xls文件:

    (3)编写文件sava2excel.py,和getWinningNum.py必须是在同一目录下:

    #! /usr/bin/env python
    #-*- coding:utf-8 -*-

    import xlwt

    class SaveBallDate(object):
        """Write scraped draw records to an ``.xls`` workbook with xlwt.

        Instantiating the class immediately saves ``items`` via ``run``.
        """

        def __init__(self, items):
            self.items = items
            self.run(self.items)

        def run(self, items):
            """Save ``items`` to the workbook: a header row, then one row per draw.

            Each item must expose the attributes ``date``, ``order``,
            ``red1``..``red6``, ``blue``, ``money``, ``firstPrize`` and
            ``secondPrize``.
            """
            fileName = u'双色球.xls'.encode('GBK')
            book = xlwt.Workbook(encoding='utf8')
            sheet = book.add_sheet('ball', cell_overwrite_ok=True)

            # Row 0 holds the column titles, written as UTF-8 byte strings to
            # match the workbook encoding.
            headers = (u'开奖日期', u'期号', u'红1', u'红2', u'红3', u'红4',
                       u'红5', u'红6', u'蓝', u'销售金额', u'一等奖', u'二等奖')
            for col, title in enumerate(headers):
                sheet.write(0, col, title.encode('utf8'))

            # Attribute names in the same column order as the headers.
            fields = ('date', 'order', 'red1', 'red2', 'red3', 'red4',
                      'red5', 'red6', 'blue', 'money', 'firstPrize',
                      'secondPrize')
            # Data rows start at 1. Iterating the items directly writes every
            # record; the old ``while i < len(items)`` loop dropped the last.
            for row, item in enumerate(items, 1):
                for col, field in enumerate(fields):
                    sheet.write(row, col, getattr(item, field))
            book.save(fileName)

    # Nothing happens when this file is run directly.
    # NOTE(review): presumably SaveBallDate is meant to be imported by
    # getWinningNum.py - confirm.
    if __name__ == '__main__':
        pass


    (4)修改getWinningNum.py文件:

    (5)运行结果,在编辑器中打开文件双色球.xls:

    
    
    
    
  • 相关阅读:
    C#之枚举
    C#之判断字母大小、字母转ACII码
    C#之BF算法
    md5如何实现encodePassword加密方法
    基本配置及安全级别security-level
    js中“原生”map
    web.xml讲解
    java application指的是什么
    .conf、.bak是什么格式
    Maven系列--web.xml 配置详解
  • 原文地址:https://www.cnblogs.com/hqutcy/p/7379915.html
Copyright © 2011-2022 走看看