  • Data Collection and Fusion: Assignment 2

    Assignment ①

    1) Seven-Day Weather Forecast Scraping Experiment

    Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather site (http://www.weather.com.cn) and save it to a database.
    Code

    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import sqlite3
    
    
    class WeatherDB:
        def openDB(self):
            self.con = sqlite3.connect("weathers.db")
            self.cursor = self.con.cursor()
            try:
                # Create the table on first run; the composite primary key
                # (wCity, wDate) rejects duplicate rows for the same city and day.
                self.cursor.execute(
                    "create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
            except Exception:
                # The table already exists: clear out rows left over from the last run.
                self.cursor.execute("delete from weathers")
    
        def closeDB(self):
            self.con.commit()
            self.con.close()
    
        def insert(self, city, date, weather, temp):
            try:
                self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values(?,?,?,?)",
                                    (city, date, weather, temp))
            except Exception as err:
                print(err)
    
        def show(self):
            self.cursor.execute("select * from weathers")
            rows = self.cursor.fetchall()
            print("%-16s%-16s%-32s%-16s" % ("city", "data", "weather", "temp"))
            for row in rows:
                print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))
    
    
    class WeatherForecast:
        def __init__(self):
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"}
            self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}
    
        def forecastCity(self, city):
            if city not in self.cityCode:
                print(city + " code cannot be found")
                return
            url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
            try:
                req = urllib.request.Request(url, headers=self.headers)
                data = urllib.request.urlopen(req)
                data = data.read()
                dammit = UnicodeDammit(data, ["utf-8", "gbk"])
                data = dammit.unicode_markup
                soup = BeautifulSoup(data, "lxml")
                # The 7-day forecast is a <ul class="t clearfix"> with one <li> per day.
                lis = soup.select("ul[class='t clearfix'] li")
                for li in lis:
                    try:
                        date = li.select('h1')[0].text
                        weather = li.select('p[class="wea"]')[0].text
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                        self.db.insert(city, date, weather, temp)
                    except Exception as err:
                        print(err)
            except Exception as err:
                print(err)
    
        def process(self, cities):
            self.db = WeatherDB()
            self.db.openDB()
    
            for city in cities:
                self.forecastCity(city)
    
            self.db.show()
            self.db.closeDB()
    
    
    ws = WeatherForecast()
    ws.process(["北京", "上海", "广州", "深圳"])
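
    To double-check what actually landed in the database, weathers.db can be reopened directly with sqlite3; a quick verification sketch using only the table defined above:

    import sqlite3

    # Reopen the database created by WeatherDB and print a few stored rows.
    con = sqlite3.connect("weathers.db")
    for row in con.execute("select wCity, wDate, wWeather, wTemp from weathers limit 5"):
        print(row)
    con.close()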
    

    Run results

    2) Reflections

    This experiment was mainly a reproduction of the textbook code. It strengthened my command of the BeautifulSoup library and gave me a first taste of sqlite3. There still seems to be a bug, though: when scraping at night, the current day's weather cannot be retrieved, which left me a bit puzzled (a defensive sketch follows below).
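
    A plausible cause, judging from the page markup, is that after the evening update the first day's entry drops the daytime <span> temperature and keeps only the nighttime <i> value, so li.select(...)[0] raises IndexError and that day is skipped. A minimal defensive sketch under that assumption (a hypothetical helper, untested against the live page):

    def parse_temp(li):
        """Return 'high/low' for a forecast <li>, or just the low when the
        daytime <span> appears to be missing after the evening update."""
        spans = li.select('p[class="tem"] span')   # daytime high; may be absent at night
        lows = li.select('p[class="tem"] i')       # nighttime low
        high = spans[0].text if spans else ""
        low = lows[0].text if lows else ""
        return high + "/" + low if high else low

    forecastCity could then build temp via parse_temp(li) instead of indexing into the two selects directly.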

    Assignment ②

    Requirement: use the requests and BeautifulSoup libraries to scrape stock information from a target site.
    Candidate sites: Eastmoney: https://www.eastmoney.com/
    Sina Finance stocks: http://finance.sina.com.cn/stock/
    Tip: open the F12 developer tools in Chrome and capture the network traffic to find the URL that loads the stock list, then analyze the values the API returns and adjust its request parameters as required. From the URL you can see that parameters such as f1 and f2 select different values, and parameters can be trimmed as appropriate.
    Reference: https://zhuanlan.zhihu.com/p/50099084

    1) Stock Information Scraping Experiment

    Approach: the page is loaded dynamically, so the BeautifulSoup methods I reached for at first could not retrieve the actual data. I therefore captured the real API URL and matched the response with regular expressions.
    A tip from my roommate: the field that differs between stock boards is fs, while pn and pz are the page number and the number of stocks per page (see the sketch below).
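
    Passing the query string as a requests params dict makes the pn/pz/fs roles explicit. Below is a sketch of the captured clist endpoint under that reading; the host, token, and field meanings are copied or inferred from the captured URL, not from any documented API:

    import requests

    BASE = "http://69.push2.eastmoney.com/api/qt/clist/get"

    def fetch_page(pn, fs, pz=20):
        # Without the cb= callback parameter the endpoint appears to return plain JSON.
        params = {
            "pn": pn,            # page number
            "pz": pz,            # stocks per page
            "po": 1, "np": 1, "fltt": 2, "invt": 2, "fid": "f3",
            "ut": "bd1d9ddb04089700cf9c27f6f7426281",  # token copied from the capture
            "fs": fs,            # board selector without the "fs=" prefix, e.g. "m:1+t:2,m:1+t:23"
            "fields": "f2,f3,f4,f12,f14",  # trimmed list; f12/f14 appear to be code/name
        }
        return requests.get(BASE, params=params, timeout=10).text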

    Code

    import re
    import requests
    pn = input("Number of pages to scrape per board: ")
    count = 1
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
            }
    fs_list = {      # Comparing the URLs of several boards shows the fs field is what tells them apart; three are listed here
        "沪深A股":"fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23",
        "上证A股":"fs=m:1+t:2,m:1+t:23",
        "深圳A股":"fs=m:0+t:6,m:0+t:13,m:0+t:80"
    }
    tplt = "{0:{13}<5}{1:{13}<5}{2:{13}<5}{3:{13}<5}{4:{13}<5}{5:{13}<5}{6:{13}<5}{7:{13}^10}{8:{13}<5}{9:{13}<5}{10:{13}<5}{11:{13}<5}{12:{13}<5}"
    print(tplt.format("No.","Code","Name","Latest","Change%","Change","Volume","Turnover","Amplitude","High","Low","Open","PrevClose",chr(12288)))
    for key in fs_list:
        print("Scraping " + key + " data")
        fs = fs_list[key]
        for i in range(1,int(pn)+1):
            url = "http://69.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124018833205583945567_1601998353892&pn="+str(i)+"&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&"+str(fs)+"&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1601998353939"
            r = requests.get(url=url,headers=headers)  
            page_text = r.text
            # print(page_text)
            reg = '"diff":[(.*?)]'    #把我们需要的数据爬取出来
            list1 = re.findall(reg,page_text)  
            #print(list)   
            list2 = re.findall('{(.*?)}',list1[0],re.S)   #把每支股票匹配出来
            #print(len(list2))
            #print(list2)
            for j in list2:    
                list3 = []
                list3.append(j.split(","))  #用逗号分割之后,方便我们提取需要的数据,接下来只要观察就行了
                #print(list3)
                num = list3[0][11].split(":")[-1].replace('"',"")   # strip the quotation marks around the value
                name = list3[0][13].split(":")[-1].replace('"',"")
                new_price = list3[0][1].split(":")[-1].replace('"',"")
                crease_range = list3[0][2].split(":")[-1].replace('"',"")+"%"
                crease_price = list3[0][3].split(":")[-1].replace('"',"")
                com_num = list3[0][4].split(":")[-1].replace('"',"")
                com_price = list3[0][5].split(":")[-1].replace('"',"")
                move_range = list3[0][6].split(":")[-1].replace('"',"")+"%"
                top = list3[0][14].split(":")[-1].replace('"',"")
                bottom = list3[0][15].split(":")[-1].replace('"',"")
                today = list3[0][16].split(":")[-1].replace('"',"")
                yestoday = list3[0][17].split(":")[-1].replace('"',"")
                print('%-10s%-10s%-10s%-10s%-10s%-10s%-10s%-10s%10s%10s%10s%10s%10s' %(count,num,name,new_price,crease_range,crease_price,com_num,com_price,move_range,top,bottom,today,yestoday))
                count += 1
            print("=================================================="+key+"第"+str(i)+"页内容打印成功===============================================")
    

    Run results

    2) Reflections

    This experiment took quite a while. I started with select and spent a long time debugging the tag selectors, only to get no return value once I had targeted the content I wanted; that is when I realized BeautifulSoup could not do the job. Comparing URLs afterwards also ate up a fair amount of time, but overall I got a lot out of this experiment (a cleaner parsing alternative is sketched below).
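
    In hindsight, the response body is just JSON wrapped in a jQuery JSONP callback, so stripping the wrapper and calling json.loads would avoid the brittle regex-and-split parsing entirely. A minimal sketch under that assumption (the field meanings are inferred from the capture, not documented):

    import json
    import re

    def parse_stocks(page_text):
        """Strip the jQuery callback wrapper and return the list of stock records."""
        body = re.search(r'\((\{.*\})\)', page_text, re.S).group(1)
        data = json.loads(body)
        # With np=1 in the request, data["data"]["diff"] is a list of dicts,
        # each keyed by field name, e.g. f12 = code, f14 = name, f2 = latest price.
        return data["data"]["diff"]

    Each record could then be read as stock["f12"] or stock["f14"] instead of counting comma-separated positions.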

    Assignment ③

    Requirement: select a stock based on a self-chosen 3-digit number plus the last 3 digits of your student ID, and retrieve that stock's information. Use the same packet-capture method as in Assignment ②.
    Candidate sites: Eastmoney: https://www.eastmoney.com/
    Sina Finance stocks: http://finance.sina.com.cn/stock/

    1) Stock Information Scraping Experiment

    Approach: with Assignment ② as groundwork, this one felt fairly easy. A self-chosen 3-digit suffix is not guaranteed to exist, so I iterate through the listings and break as soon as a stock code whose last three digits match is found. I only traversed the Shanghai-Shenzhen A-share board; a match turned up quickly, so I did not walk through every board. (The page bound below is hardcoded; a sketch after this paragraph shows how it could be derived from the response instead.)
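
    The captured JSON also appears to carry a total record count alongside the diff array, so the hardcoded upper bound of 211 pages could be computed instead. A sketch under that assumption (the "total" field is inferred from the captured response, not from any documented API):

    import json
    import re

    def pages_needed(page_text, page_size=20):
        """Derive the page count from the 'total' field the clist response appears to include."""
        body = re.search(r'\((\{.*\})\)', page_text, re.S).group(1)
        total = json.loads(body)["data"]["total"]
        return (total + page_size - 1) // page_size   # ceiling division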
    Code

    import re
    import requests
    flag = 0
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
            }
    print("股票代码","股票名称","今日开","今日最高","今日最低")
    for i in range(1,211):
        url = "http://69.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124018833205583945567_1601998353892&pn="+str(i)+"&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1601998353939"
        r = requests.get(url=url,headers=headers)  
        page_text = r.text
        reg = '"diff":[(.*?)]'    #把我们需要的数据爬取出来
        list1 = re.findall(reg,page_text)  
        list2 = re.findall('{(.*?)}',list1[0],re.S)   #把每支股票匹配出来
        for j in list2:    
            list3 = []
            list3.append(j.split(","))  #用逗号分割之后,方便我们提取需要的数据,接下来只要观察就行了
            num = list3[0][11].split(":")[-1].replace('"',"")   # strip the quotation marks around the value
            name = list3[0][13].split(":")[-1].replace('"',"")
            top = list3[0][14].split(":")[-1].replace('"',"")
            bottom = list3[0][15].split(":")[-1].replace('"',"")
            today = list3[0][16].split(":")[-1].replace('"',"")
            if num.endswith("106"):   # stop at the first code ending with the chosen suffix
                print(num,name,today,top,bottom)
                flag = 1
                break
        if flag == 1:
            print("The requested stock has been retrieved!")
            break
    if flag == 0:
        print("The requested stock was not found")
    

    Run results

    2) Reflections

    Hmm, I have a feeling I may have misread the intent of this assignment...
