  • Assignment 2

    Task ①

    1)WeatherForecast

    Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather site http://www.weather.com.cn and store it in a database.
    Code:

    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import sqlite3
    
    
    class WeatherDB:  # wraps the operations on the database
        def openDB(self):
            self.con = sqlite3.connect('weathers.db')
            self.cursor = self.con.cursor()
            try:
                self.cursor.execute(
                    'create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key(wCity,wDate))')
            except:  # the table is created successfully on the first run; on later runs creation fails and the existing rows are cleared instead
                self.cursor.execute('delete from weathers')
    
        def closeDB(self):
            self.con.commit()
            self.con.close()
    
        def insert(self, city, date, weather, temp):
            try:
                self.cursor.execute('insert into weathers (wCity,wDate,wWeather,wTemp)values(?,?,?,?)',
                                    (city, date, weather, temp))  # store the scraped forecast record in the weathers.db database
            except Exception as err:
                print(err)
    
        def show(self):
            self.cursor.execute('select * from weathers')   # run the query and print the whole table
            rows = self.cursor.fetchall()
            print('%-16s%-16s%-32s%-16s' % ('city', 'date', 'weather', 'temp'))
            for row in rows:
                print('%-16s%-16s%-32s%-16s' % (row[0], row[1], row[2], row[3]))   
    
    
    class WeatherForecast:  
        def __init__(self):
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre)Gecko/2019100821 Minefield/3.0.2pre'}  # request headers that make the crawler look like an ordinary browser
            self.cityCode = {'北京': '101010100', '上海': '101020100', '广州': '101280101', '深圳': '101280601'}  # city name -> weather.com.cn city code
    
        def forecastCity(self, city):
            if city not in self.cityCode.keys():
                print(city + ' code cannot be found')
                return
    
            url = 'http://www.weather.com.cn/weather/' + self.cityCode[city] + '.shtml' 
            try:
                req = urllib.request.Request(url, headers=self.headers)  
                data = urllib.request.urlopen(req)
                data = data.read()
                dammit = UnicodeDammit(data, ['utf-8', 'gbk'])  # detect the encoding, trying utf-8 then gbk
                data = dammit.unicode_markup
                soup = BeautifulSoup(data, 'lxml')
                lis = soup.select("ul[class='t clearfix'] li")  # each <li> holds one day's forecast
                for li in lis:
                    try:
                        date = li.select('h1')[0].text
                        weather = li.select('p[class="wea"]')[0].text
                        temp = li.select('p[class="tem"] span')[0].text + '/' + li.select('p[class="tem"] i')[0].text
                        print(city, date, weather, temp)
                        self.db.insert(city, date, weather, temp)  # insert the record into the database
                    except Exception as err:
                        print(err)
            except Exception as err:
                print(err)
    
        def process(self, cities):
            self.db = WeatherDB()
            self.db.openDB()
            for city in cities:
                self.forecastCity(city)  # loop over every city
            self.db.show()
            self.db.closeDB()
    
    ws = WeatherForecast()
    ws.process(['北京', '上海', '广州', '深圳'])
    print('completed')
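
    Since the requirement is to save the forecasts to a database, it is worth checking what actually landed in weathers.db after a run. Below is a minimal sketch, assuming the script above has already been run and weathers.db sits in the working directory:

    import sqlite3

    # open the database file created by WeatherDB (assumed to exist from the run above)
    con = sqlite3.connect('weathers.db')
    cursor = con.cursor()

    # count the stored rows per city to confirm each city contributed its 7 daily records
    cursor.execute('select wCity, count(*) from weathers group by wCity')
    for city, n in cursor.fetchall():
        print(city, n)

    con.close()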
    
    

    Partial screenshot of the run results

    2) Reflections:

    The code for this task was typed in by following the textbook, and anything I didn't understand I sorted out by searching Baidu. The main gains were more practice with BeautifulSoup and a first look at the sqlite database.

    Task ②

    2)

    Requirement: use the requests and BeautifulSoup libraries to scrape stock information from a chosen site.
    Candidate sites: Eastmoney https://www.eastmoney.com/

    Sina Finance stocks http://finance.sina.com.cn/stock/

    Tip: open Chrome's F12 developer tools and capture the network traffic to find the URL used to load the stock list, then analyze the values the API returns. The request parameters can be adjusted to match what is needed; from the URL you can see that fields such as f1 and f2 fetch different values, so the requested fields can be trimmed as appropriate.
    Approach:
    Comparing the data URLs shows that, within the same board, the page number corresponds to the pn parameter,

    while, for the same page number, the board is selected by the fid and fs parameters.
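
    As a small sketch of that request, the list endpoint can be called without the cb callback parameter. Assuming the endpoint then returns plain JSON (with cb present it returns the JSONP-wrapped text that the regex below parses), the "diff" list can be read with json instead of a regex. The fields list is trimmed here to four columns, which, going by the column order the full program below prints, should be f12 = code, f14 = name, f2 = latest price and f3 = change %:

    import requests

    cmd = 'fid=f3&fs=m:1+t:2,m:1+t:23'     # board selector for 上证A股, taken from the cmd dict below
    page = 1
    url = "http://68.push2.eastmoney.com/api/qt/clist/get?pn=" + str(page) + \
          "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&" + cmd + \
          "&fields=f2,f3,f12,f14"
    r = requests.get(url)
    data = r.json()['data']                  # assumption: plain JSON nested under 'data' when cb is omitted
    print('total records:', data['total'])   # ceil(total / 20) gives the number of pages
    for row in data['diff']:
        print(row['f12'], row['f14'], row['f2'], row['f3'])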

    Reference: https://zhuanlan.zhihu.com/p/50099084
    Code:

    import requests
    import re
    import math
    
    # fetch the page data from the server with a GET request
    def getHtml(cmd, page):
        url = "http://68.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409784442493077996_1601810442107&pn=" + str(
            page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&" + cmd + "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"
        r = requests.get(url)
        pat = r'"diff":\[(.*?)\]'
        data = re.compile(pat, re.S).findall(r.text)[0]
        all_page = math.ceil(int(re.findall(r'"total":(\d+)', r.text)[0]) / 20)    # total record count; dividing by 20 and rounding up gives the page count
        return data, all_page
    
    # print the stock data of a single page
    def getOnePageStock(cmd, page):
        data, all_page = getHtml(cmd, page)
        datas = data.split("},")          # split into individual stock records
        global p
        for i in range(len(datas)):
            p += 1
            stocks = re.sub('["{}]', '', datas[i]).split(",")     # split one record into its fields
            print(tplt.format(p, stocks[11].split(":")[1], stocks[13].split(":")[1], stocks[1].split(":")[1],      # print one stock row
                              stocks[2].split(":")[1], stocks[3].split(":")[1],
                              stocks[4].split(":")[1], stocks[5].split(":")[1], stocks[6].split(":")[1],
                              stocks[14].split(":")[1], stocks[15].split(":")[1],
                              stocks[16].split(":")[1], stocks[17].split(":")[1], chr(12288)))
    
    
    cmd = {
        "沪深A股": "fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23",            
        "上证A股": "fid=f3&fs=m:1+t:2,m:1+t:23",
        "深证A股": "fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80",
        "新股": "fid=f26&fs=m:0+f:8,m:1+f:8",
        "中小板": "fid=f3&fs=m:0+t:13",
        "创业板": "fid=f3&fs=m:0+t:80"
    }
    for i in cmd.keys():
        tplt = "{0:^13}{1:^13}{2:{13}^13}{3:^13}{4:^13}{5:^13}{6:^13}{7:^13}{8:^13}{9:^13}{10:^13}{11:^13}{12:^13}"
        print(i)
        print("{0:^11}{1:^11}{2:{13}^12}{3:^12}{4:^12}{5:^12}{6:^10}{7:^10}{8:^12}{9:^12}{10:^12}{11:^12}{12:^12}".format(
            "序号", "股票代码", "股票名称", "最新报价", "涨跌幅", "涨跌额", "成交量", "成交额", "振幅", "最高", "最低", "今开", "昨收", chr(12288)))
        page = 0
        p = 0
        stocks, all_page = getHtml(cmd[i], 1)   # fetch page 1 once just to learn the total page count
        while True:
            page += 1                           # start from page 1 so the first page is printed as well
            if page <= all_page:                # page-count check
                getOnePageStock(cmd[i], page)
            else:
                break
    

    Partial screenshot of the run results:

    2) Reflections:

    This exercise scraped a page whose data is loaded dynamically by JS. I spent a long time working out how to get at the data, and then on the page count for each board (in the end I borrowed a classmate's approach), but I still learned a lot from it.

    Task ③

    3)

    Requirement: pick a stock according to three self-chosen digits plus the last three digits of your student ID, and fetch the corresponding stock information. The packet-capture method is the same as in Task ②.
    Candidate sites: Eastmoney https://www.eastmoney.com/

    Sina Finance stocks http://finance.sina.com.cn/stock/
    Code:

    import requests
    import re
    import math
    
    
    # fetch the page data from the server with a GET request
    def getHtml(cmd, page):
        url = "http://68.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409784442493077996_1601810442107&pn=" + str(
            page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&" + cmd + "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152"
        r = requests.get(url)
        pat = r'"diff":\[(.*?)\]'
        data = re.compile(pat, re.S).findall(r.text)[0]
        all_page = math.ceil(int(re.findall(r'"total":(\d+)', r.text)[0]) / 20)    # total record count; dividing by 20 and rounding up gives the page count
        return data, all_page
    
    
    # search one page of stock data for the target code
    def getOnePageStock(cmd, page):
        data, all_page = getHtml(cmd, page)
        datas = data.split("},")
        for i in range(len(datas)):
            stocks = re.sub('["{}]', '', datas[i]).split(",")
            if (stocks[11].split(":")[1] == "002105"):    # 002105 is the chosen stock code
                print(tplt.format("股票代码号", "股票名称", "今日开", "今日最高", "今日最低", chr(12288)))
                print(tplt.format(stocks[11].split(":")[1], stocks[13].split(":")[1], stocks[16].split(":")[1],
                                  stocks[14].split(":")[1],
                                  stocks[15].split(":")[1], chr(12288)))
                global p               # once found, print it, set p = 1 and break out of the loop
                p = 1
                break
    
    
    cmd = {
        "沪深A股": "fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23",
        "上证A股": "fid=f3&fs=m:1+t:2,m:1+t:23",
        "深证A股": "fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80",
        "新股": "fid=f26&fs=m:0+f:8,m:1+f:8",
        "中小板": "fid=f3&fs=m:0+t:13",
        "创业板": "fid=f3&fs=m:0+t:80"
    }
    p = 0
    for i in cmd.keys():
        tplt = "{0:^8}	{1:{5}^8}	{2:^8}	{3:^8}	{4:^8}"
        page = 0
        stocks, all_page = getHtml(cmd[i], 1)   # fetch page 1 once just to learn the total page count
        # crawl the pages automatically, starting from page 1, and stop at the end
        while True:
            page += 1
            if page <= all_page:
                getOnePageStock(cmd[i], page)
            else:
                break
            if p == 1:
                break
        if p == 1:           # the stock has been found, so skip the remaining boards
            break
    if p == 0:
        print("没找到对应的股票代码")
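
    The search above depends on the field order produced by splitting the JSON text on commas. As a variant sketch, under the same assumption as in Task ② that the endpoint returns plain JSON once the cb parameter is dropped, the target code can be filtered straight out of the parsed "diff" list. Only the 沪深A股 board is queried, since it already covers both exchanges, and the field mapping (f12 = code, f14 = name, f17 = open, f15 = high, f16 = low) is the one the program above relies on:

    import requests

    target = '002105'      # the chosen stock code, as in the program above
    base = ('http://68.push2.eastmoney.com/api/qt/clist/get?pz=20&po=1&np=1'
            '&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2'
            '&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23'   # 沪深A股 board
            '&fields=f12,f14,f15,f16,f17&pn=')

    page, found = 1, None
    while found is None:
        data = requests.get(base + str(page)).json()['data']   # assumes plain JSON when cb is omitted
        if not data or not data.get('diff'):
            break                                               # past the last page without a match
        for row in data['diff']:
            if row['f12'] == target:
                found = row
                break
        page += 1

    if found:
        print(found['f12'], found['f14'], found['f17'], found['f15'], found['f16'])
    else:
        print("没找到对应的股票代码")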
    
    

    Run results:

    2) Reflections:

    It seems I only added an if check on top of the Task ② code, so I'm not sure I really understood what the problem was asking...
