zoukankan      html  css  js  c++  java
  • 第一次爬虫测试

    球赛测试:

    # -*- coding: utf-8 -*-
    """
    Created on Thu May 23 00:41:48 2019
    
    @author: h2446
    """
    
    from random import random
    def printIntro():
        print("这个程序模拟两支排球队A和B的排球比赛")
        print("程序运行需要A和B的能力值(以0到1之间的小数表示)")
    def getInputs():
        a=eval(input("请输入队伍A的能力值(0~1):"))
        b=eval(input("请输入队伍B的能力值(0~1):"))
        n=eval(input("模拟比赛的场次:"))
        return a,b,n
    def simNGames(n,probA,probB):
        winsA,winsB=0,0
        for i in range(n):
            scoreA,scoreB=simOneGame(probA,probB)
            if scoreA>scoreB:
                winsA +=1
            else:
                winsB +=1
            return winsA,winsB
    def gameOver(a,b):
            if (a>25 and abs(a-b)>=2 )or(b>25 and abs(a-b)>=2):
                return True
            else:
                return False
    def simOneGame(probA,probB):
        scoreA,scoreB=0,0
        serving = "A"
        while not gameOver(scoreA,scoreB):
            if serving =="A":
                if random()<probA:
                    scoreA +=1
                else:
                    serving="B"
            else:
                if random()<probB:
                    scoreB +=1
                else:
                    serving="A"
        return scoreA,scoreB
    def final(probA,probB):
         winsA,winsB=simNGames1(4,probA,probB)
         printSummary(winsA,winsB)
         if not winsA==3 or winsB==3:
             if winsA==winsB==2:
                 winsA1,winsB1=simOneGame1(probA,probB)
                 finalprintSummary(winsA,winsB)
         else:
             finalprintSummary(winsA,winsB)
    def simNGames1(n,probA,probB):
         winsA,winsB=0,0
         for i in range(n):
             scoreA,scoreB=simOneGame2(probA,probB)
             if winsA==3 or winsB==3:
                 break
             if scoreA>scoreB:
                 winsA+=1
             else:
                 winsB+=1
         return winsA,winsB
    def simOneGame2(probA,probB):
         scoreA,scoreB=0,0
         serving="A"
         while not GG(scoreA,scoreB):
             if serving=="A":
                 if random() < probA:
                     scoreA += 1
                 else:
                     serving="B"
             else:
                 if random() < probB:
                     scoreB += 1
                 else:
                     serving="A"
         return scoreA,scoreB
    def simOneGame1(probA,probB):
        scoreA,scoreB=0,0
        serving="A"
        while not finalGameOver(scoreA,scoreB):
            if serving=="A":
                if random() < probA:
                    scoreA += 1
                else:
                      serving="B"
            else:
                if random() < probB:
                    scoreB += 1
                else:
                    serving="A"
                    return scoreA,scoreB
    def GG(a,b):
        return a==3 or b==3
    def finalGameOver(a,b):
         if (a==8 or b==8):
             if a>b:
                 print("A队获得8分,双方交换场地")
             else:
                 print("B队获得8分,双方交换场地")
         if (a>15 and abs(a-b)>=2 )or(b>15 and abs(a-b)>=2):
             return True
         else:
             return False
    def finalprintSummary(winsA,winsB):
         n=winsA+winsB
         if n>=4:
             print("进行最终决赛")
             if winsA>winsB:
                 print("最终决赛由A获胜")
             else:
                 print("最终决赛由B获胜")
         else:
                if winsA>winsB:
                    print("最终决赛由A获胜")
                else:
                    print("最终决赛由B获胜")
    def printSummary(winsA,winsB):
            n=winsA+winsB
            print("竞技分析开始,共模拟{}场比赛".format(n))
            print("选手A获胜{}场比赛,占比{:0.1%}".format(winsA,winsA/n))
            print("选手B获胜{}场比赛,占比{:0.1%}".format(winsB,winsB/n))
    def main():
            printIntro()
            probA,probB,n=getInputs()
            winsA,winsB=simNGames(n,probA,probB)
            printSummary(winsA,winsB)
            final(probA,probB)
    try:
        main()
    except:
        print("Error")

    测试结果:

    使用requests库中的get()函数访问如下一个网站20次(这里采用必应网为例子),打印返回状态,text内容,计算text()属性和content属性所返回网页内容的长度。

    import requests
    from bs4 import BeautifulSoup
    def getHTMLText(r):
        x = r.status_code
        if x == 200 :
            print("访问成功")
        if x == 404:
            print("访问失败")
        return ''
    r = requests.get("https://cn.bing.com/",timeout = 100)
    for i in range(1,21):          
        print("第{}次".format(i))
        getHTMLText(r)
    url="https://cn.bing.com/"
    r=requests.get(url)
    print(r.status_code)
    r.encoding='UTF-8'
    print(r.text)

    print("text属性长度{}".format(len(r.text)))
    print("content属性长度{}".format(len(r.content)))

     

    返回状态:

    text部分内容:

    text()属性和content属性所返回网页内容的长度:

     

     html页面的简单相关计算

    先将下面的代码保存到电脑上,并使用浏览器打开

    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="utf-8">
    <title>菜鸟教程(runoob.com)</title>
    </head>
    <body>
        <h1>我的第一个标题</h1>
        <p id=first>我的第一个段落。</p >
    </body>
            <table border="1">
        <tr>
            <td>班级</td>
            <td>2班</td>
        </tr>
        <tr>
            <td>我的学号</td>
            <td>3109</td>
        </tr>
    </table>
    </html>
    import requests
    from bs4 import BeautifulSoup
    f = open("C:/Users/86183/Desktop/.html",'r',encoding="utf-8")
    r = f.read()
    f.close()
    soup = BeautifulSoup(r,"html.parser")
    print("(1)head标签:
    ",soup.head)
    print("(2)body标签内容:
    ",soup.title)
    print("(3)id为first的标签对象:
    ",soup.find(id='first'))
    print("(4)获取该html中的中文字符:",soup.find('h1').string,
       soup.find(id='first').string)

    运行截图如下:

     爬取中国大学网站的相关内容,并保存为CSV格式,代码如下

    import requests
    from bs4 import BeautifulSoup
    def GetHTMLText(url):
        try :
            r = requests.get(url,timeout = 30)
            r.raise_for_status()
            r.encoding = "utf-8"
            return  r.text
        except:
            return ""
    def FindUnivList(soup):
        data = soup.find_all('tr')
        
        lth = data[0].find_all('th')          #通过查看网页html代码,找到第一个'tr'标签下的'th'标签就是表格标题。
        thead = []
        for th in lth[:4]:                      #前四个表格标题(第五个特殊,下面处理)
            thead.append(th.string)
        tOptions = data[0].find_all('option')     #处理第五个特殊的表格标题
        for tOption in tOptions:                 
            thead.append(tOption.string)
        allUniv.append(thead)
        
        for tr in data:                         #获取数据内容
            ltd = tr.find_all('td')
            if len(ltd) ==  0:
                continue
            tbodies = []
            for td in ltd:
                if td.string is None:           #对无数据内容处理
                    tbodies.append('')
                    continue
                tbodies.append(td.string)
            allUniv.append(tbodies)
    
    
    def main():
        url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html"
        html = GetHTMLText(url)
        soup = BeautifulSoup(html,"html.parser")
        FindUnivList(soup)
        #储存为csv
        f = open('2019中国大学排名.csv','w',encoding='utf-8')
        for row in allUniv:
            f.write(','.join(row)+'
    ')
        f.close()
    
    allUniv = []
    main()
    View Code

    截图如下:

  • 相关阅读:
    常用的20个正则表达式
    关于position:fixed;的居中问题
    Html table 合并单元格
    JS异步加载的三种方式
    DOM事件代码小结
    用js写一个回车键盘事件
    JavaScript 常用方法总结
    可拖拽进度条
    js数组拍平
    js验证码倒计时
  • 原文地址:https://www.cnblogs.com/kwjl/p/12907473.html
Copyright © 2011-2022 走看看