
    1. A Simple Crawler

    Goal: write a crawler that fetches the home pages of common search engines (e.g. Baidu, Sogou).

    Tools: Python's requests, BeautifulSoup4, and lxml libraries.
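    All three are third-party packages. Assuming a standard Python setup with pip available, they can be installed with:

    pip install requests beautifulsoup4 lxml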

    Approach:

    First, we need to fetch the page. This is straightforward with requests.get():

    # coding = utf-8
    
    import requests
    
    url = "http://www.baidu.com"
    
    # Limit how long the request may take; an error is raised on timeout,
    # so a slow network cannot stall the program indefinitely.
    r = requests.get(url, timeout=1)
    
    # A status code of 200 means the page was fetched successfully.
    print(r.status_code)

    Running this under cmd on Windows 10 prints the status code: 200 when the connection succeeds.
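    Note that when the timeout expires, requests raises requests.exceptions.Timeout rather than returning a response. A minimal sketch of handling that, using the same Baidu homepage as above:

    import requests
    
    try:
        r = requests.get("http://www.baidu.com", timeout=1)
        print(r.status_code)
    except requests.exceptions.Timeout:
        # The server did not respond within 1 second.
        print("request timed out")

    The next script builds on this. It lists several search-engine homepages; a commented-out loop saves each page to disk and extracts every href with lxml's XPath, while the active code re-fetches Baidu's homepage and extracts its text with BeautifulSoup: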

    # encoding = utf-8
    import requests
    from lxml import html
    from bs4 import BeautifulSoup
    
    # Lesson learned: never give a script the same name as a library it
    # imports, or Python will find the script instead of the library.
    url = [
        "http://www.baidu.com",
        "http://www.google.com",
        "http://www.sogou.com",
        "http://www.bing.com",
        "http://www.so.com"
    ]
    
    name = ["baidu", "google", "sogou", "bing", "360"]
    '''
    for i in range(5):
        r = requests.get(url[i], timeout=10)
        r.encoding = 'utf-8'
        tree = html.fromstring(r.text)
        urls = []
        if r.status_code == 200:
            # Create an HTML file and save the page into it
            # (raw string so the backslash in the path is literal).
            with open(r"D:\{}.html".format(name[i]), 'w', encoding='utf-8') as f:
                f.write(r.text)
            print("This is {} times: Successful!".format(i))
            # Collect every href attribute in the document; use names other
            # than i so the outer loop counter is not clobbered.
            for href in tree.xpath("//@href"):
                urls.append(href)
            for u in urls:
                print(u)
        else:
            print("This is {} times: False!".format(i))
    '''
    
    # Fetch the first URL 20 times and print each status code.
    for i in range(20):
        r = requests.get(url[0], timeout=1)
        print(r.status_code)
    r.encoding = 'utf-8'
    
    print("type of text: ", type(r.text))
    print("type of content: ", type(r.content))
    
    # BeautifulSoup expects markup text, not a Response object.
    soup = BeautifulSoup(r.text, 'html.parser')
    print(soup.get_text())
    #print(r.text)
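    The commented-out loop uses lxml's XPath to collect links; the same extraction can be done with BeautifulSoup alone. A minimal sketch:

    import requests
    from bs4 import BeautifulSoup
    
    r = requests.get("http://www.baidu.com", timeout=10)
    r.encoding = 'utf-8'
    
    # Collect every <a> tag that carries an href attribute.
    soup = BeautifulSoup(r.text, 'html.parser')
    for a in soup.find_all('a', href=True):
        print(a['href'])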

    Processing an HTML file (a sketch covering all four tasks follows the parsing example below):
    
    a. Print the contents of the body tag
    
    b. Get the contents of the body tag
    
    c. Get the tag object whose id is "first"
    
    d. Extract and print the Chinese characters in the HTML page

    from bs4 import BeautifulSoup
    
    html_doc = """
    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="utf-8">
    <title>菜鸟教程(runoob.com)</title>
    </head>
    <body>
        <h1>我的第一个标题</h1>
        <p>我的第一个段落。</p>
    </body>
            <table border="1">
        <tr>
            <td>row 1, cell 1</td>
            <td>row 1, cell 2</td>
        </tr>
        <tr>
            <td>row 2, cell 1</td>        
            <td>row 2, cell 2</td>        
        </tr>
    </table>
    </html>
    """    
    # Parse the page with BeautifulSoup to get a BeautifulSoup object.
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # Print the page source, pretty-printed.
    print(soup.prettify())
    
    # Extract the page's text content.
    #print(soup.get_text())
    text = soup.get_text()
    
    print("---------------")
    #print()
    
    # Print the number of top-level nodes in the parsed document
    # (soup.contents is the list of direct children, not lines of text).
    print(len(soup.contents))
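    The block above only parses and pretty-prints the page. A minimal sketch of tasks a-d, reusing the html_doc string defined above: note that html_doc contains no element with id="first", so the task-c lookup prints None here (on a page with such an element it would return the tag), and the task-d regular expression matches the common CJK ideograph range.

    import re
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # a. Print the contents of the body tag.
    print(soup.body)
    
    # b. Get the body tag's contents as a list of its child nodes.
    body_children = soup.body.contents
    print(body_children)
    
    # c. Get the tag object whose id is "first" (None for html_doc above).
    first_tag = soup.find(id="first")
    print(first_tag)
    
    # d. Extract and print the Chinese characters in the page.
    chinese = re.findall(r'[\u4e00-\u9fa5]+', soup.get_text())
    print(chinese)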

    A quick test run

    Scrape the Chinese university rankings from http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html
    
    Here we scrape the 2015 edition of the data.

    # coding = utf-8
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from pymongo import MongoClient
    
    client = MongoClient()
    db = client['UnivRanking']
    collection = db['UnivRanking']
    
    def save_to_mongo(result):
        # MongoDB documents must be dicts, so wrap each ranking row
        # (a list of cell strings) before inserting.
        try:
            if collection.insert_many([{'row': row} for row in result]):
                print('Save to Mongo')
        except Exception:
            print("Error saving to Mongo")
    
    allUniv = []
    
    def getHTMLText(url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()  # raise HTTPError on 4xx/5xx responses
            r.encoding = 'utf-8'
            return r.text
        except requests.RequestException:
            return ""
    
    def fillUnivList(soup):
        # Every ranking row is a <tr>; header rows have no <td> cells.
        data = soup.find_all('tr')
        for tr in data:
            ltd = tr.find_all('td')
            if len(ltd) == 0:
                continue
    
            singleUniv = []
    
            for td in ltd:
                singleUniv.append(td.string)
    
            allUniv.append(singleUniv)
    
    def printUnivList(num):
        # chr(12288) is the full-width CJK space; using it as the fill
        # character keeps columns of Chinese text aligned. The headers are
        # kept in Chinese to match the data on the source site.
        print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^8}{6:{0}^8}{7:{0}^8}".format(chr(12288), "排名", "学校名称", "省市", "总分", "人才培养得分", "科学研究得分", "社会服务得分"))
    
        for i in range(num):
            u = allUniv[i]
            print("{1:{0}^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^10}{6:{0}^10}{7:{0}^10}".format(chr(12288), u[0], u[1], u[2], u[3], u[4], u[5], u[6]))
    
        name = ["排名", "学校名称", "省市", "总分", "人才培养得分", "科学研究得分", "社会服务得分"]
        test = pd.DataFrame(columns=name, data=allUniv)
        print(test)
        test.to_csv('testcsv.csv', encoding='utf-8')
        
    
    def main(num):
    
        url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2015_0.html"
    
        html = getHTMLText(url)
        soup = BeautifulSoup(html, "html.parser")
        fillUnivList(soup)
        printUnivList(num)
        save_to_mongo(allUniv)
    
    if __name__ == "__main__":
        
        main(100)
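    Once main() has run, the rows written by save_to_mongo can be read back for a quick check. A minimal sketch, assuming a default local MongoDB instance and the 'row' wrapping used above:

    from pymongo import MongoClient
    
    client = MongoClient()
    collection = client['UnivRanking']['UnivRanking']
    
    # Print the first five saved ranking rows.
    for doc in collection.find().limit(5):
        print(doc['row'])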