zoukankan      html  css  js  c++  java
  • 《鲜活的数据-第2章 处理数据》有关代码

    2.1.3 自动收集数据

    # Exploratory walk-through (Python 2): download one Weather Underground
    # daily-history page and inspect its markup step by step.
    import urllib2
    
    # Daily history page for airport code ZHCC on 2017-09-08.
    page = urllib2.urlopen("https://www.wunderground.com/history/airport/ZHCC/2017/9/8/DailyHistory.html")
    
    from BeautifulSoup import BeautifulSoup
    
    # Parse the HTTP response into a searchable tree.
    soup = BeautifulSoup(page)
    
    # Collect every <img> tag in the document.
    images = soup.findAll('img')
    
    first_image = images[0]
    
    print first_image
    
    # All elements whose class attribute is "wx-value".
    wxvalue = soup.findAll(attrs={"class":"wx-value"})
    
    print wxvalue
    
    print wxvalue[0]
    
    # NOTE: per the author's observation the next line fails, so a script run
    # stops here; it is kept to show that .span is None for this element.
    print wxvalue[0].span.string # Raises AttributeError: 'NoneType' object has no attribute 'string'
    
    # Working alternative: take the text of the element's first child node.
    print wxvalue[0].contents[0].string
    
    # Scrape one value per day of `year` from Weather Underground's daily
    # history pages for airport code ZHCC and print "YYYYMMDD,value" records
    # to stdout. Relies on `urllib2` and `BeautifulSoup` imported earlier.
    import calendar

    # Year to scrape; drives both the URL and the month-length lookup.
    year = 2016

    for m in range(1, 13):
        # monthrange() returns the real number of days in the month, so
        # 29 February is included when `year` is a leap year (2016 is).
        days_in_month = calendar.monthrange(year, m)[1]

        for d in range(1, days_in_month + 1):
            # Zero-padded YYYYMMDD key; padding keeps e.g. 1 November and
            # 11 January distinguishable.
            timestamp = '%d%02d%02d' % (year, m, d)
            print("Getting data for " + timestamp)

            # Open wunderground.com url (month and day are not padded there)
            url = "https://www.wunderground.com/history/airport/ZHCC/%d/%d/%d/DailyHistory.html" % (year, m, d)
            page = urllib2.urlopen(url)
            try:
                # The parser reads the whole response while building the tree.
                soup = BeautifulSoup(page)
            finally:
                # Release the connection even if parsing fails.
                page.close()

            # Get temperature from page: text of the first child of the
            # first element with class "wx-value".
            dayTemp = soup.findAll(attrs={"class": "wx-value"})[0].contents[0].string

            # Emit timestamp and temperature; the explicit '\n' adds an
            # empty line after each record on top of print's own newline.
            print(timestamp + ',' + dayTemp + '\n')
    

    终端输入并运行文件

    python get-weather-data.py

    2.2.3 用代码来格式化

    1. CSV转为XML

    import csv
    
    # Convert wunder-data.txt (comma-separated rows: date, temperature) into
    # XML printed on stdout. The with-block closes the input file once all
    # rows have been emitted.
    with open('wunder-data.txt', 'r') as source:
        reader = csv.reader(source, delimiter=",")

        print('<weather_data>')

        # One <observation> element per CSV row.
        for row in reader:
            print('<observation>')
            print('<date>' + row[0] + '</date>')
            print('<temperature>' + row[1] + '</temperature>')
            print('</observation>')

        print('</weather_data>')
    

    终端输入并运行文件

    python csv2xml.py >wunder-data1.xml

    # Variant that writes the XML straight to wunder-data.xml instead of
    # stdout. Expects `reader` to be a csv.reader over the input data that
    # has not been iterated yet (it is not created in this snippet).
    # The with-block closes the output file even if a row raises.
    with open('wunder-data.xml', 'w') as f:
        f.write('<weather_data>')

        # No newlines are written: the whole document ends up on one line.
        for row in reader:
            f.write('<observation>')
            f.write('<date>' + row[0] + '</date>')
            f.write('<temperature>' + row[1] + '</temperature>')
            f.write('</observation>')

        f.write('</weather_data>')
    

    2. XML转为CSV

    from BeautifulSoup import BeautifulStoneSoup
    
    # Convert wunder-data.xml back to CSV: print "date,temperature" for each
    # <observation> element. The file is read in full and closed before
    # parsing starts.
    with open('wunder-data.xml', 'r') as f:
        xml = f.read()

    soup = BeautifulStoneSoup(xml)
    observations = soup.findAll('observation')
    for o in observations:
        # Text of the <date> and <temperature> children of this observation.
        print(o.date.string + "," + o.temperature.string)
    

    终端输入并运行文件

    python xml2csv.py >wunder-data1.txt

    3. CSV转为JSON

    import csv
    
    # Convert wunder-data.txt (comma-separated rows: date, temperature) into
    # a JSON document printed on stdout.
    # Rows are loaded up front so the last one can be recognised whatever
    # the row count is, instead of assuming exactly 365 rows.
    with open('wunder-data.txt', 'r') as source:
        reader = csv.reader(source, delimiter=",")
        rows = list(reader)

    print('{ "observations": [')
    last_index = len(rows) - 1
    for index, row in enumerate(rows):

        print('{')
        print('"date": ' + '"' + row[0] + '", ')
        # Temperature is emitted unquoted, i.e. as a JSON number.
        print('"temperature": ' + row[1])

        # Every object except the last is followed by a comma.
        if index < last_index:
            print(" },")
        else:
            print(" }")

    print("] }")
    

    终端输入并运行文件

    python csv2json.py >wunder-data1.json

    4. 在循环中加入新的逻辑

    import csv
    
    # Append a freezing flag to every row of wunder-data.txt and print the
    # result as CSV: date,temperature,is_freezing. The with-block closes the
    # input file after the last row.
    with open('wunder-data.txt', 'r') as source:
        reader = csv.reader(source, delimiter=",")

        for row in reader:
            # '1' when the temperature is at or below 32, else '0'
            # (32 is presumably the Fahrenheit freezing point - confirm units).
            is_freezing = '1' if int(row[1]) <= 32 else '0'

            print(row[0] + "," + row[1] + "," + is_freezing)
    

    终端输入并运行文件

    python freezingInfo.py >wunder-data-fz.txt

    
    
  • 相关阅读:
    经典的ajax遍历循环
    fastadmin别名关联表与js下划线冲突问题解决:with里的别名,不要用驼峰,用shippingtype
    thinkphp 临时关闭布局,ajax只输出主题部分
    linux php5.6 链接sql server
    ecstore导入文件开发问题解决 死循环+不兼容mac换行解决
    thinkphp 二级目录安装
    解决crontab执行时间与系统时间不一致的问题
    电阻性能检测的二种方法(转载)
    短路的原因与危害有哪些
    潮湿引发的电路板常见故障(转载)
  • 原文地址:https://www.cnblogs.com/opengl/p/7506439.html
Copyright © 2011-2022 走看看