zoukankan      html  css  js  c++  java
  • 爬虫(urllib, BeautifulSoup的基本使用)

    代码如下:

    完成了从HTML代码中提取所需信息的操作
    
    
    import ssl
    from bs4 import BeautifulSoup
    import re
    import urllib.request, urllib.error
    import xlwt
    import sqlite3
    
    def main():
        """Disable HTTPS certificate verification, then run ``getData``.

        ``getData`` is called with the Douban Top 250 listing URL prefix; its
        return value is not used here.
        """
        # NOTE(review): this swaps urllib's default HTTPS context factory for
        # the unverified one, so every later HTTPS request in this process
        # skips certificate checks.
        ssl._create_default_https_context = ssl._create_unverified_context
        getData('https://movie.douban.com/top250?start=')
    
    # Pre-compiled patterns applied to the HTML text of one list entry.
    # re.DOTALL (the long name of re.S) lets "." match line breaks, so a
    # captured group may span several lines.
    # Target of an <a href="..."> tag.
    findLink = re.compile(r'<a href="(.*?)">', re.DOTALL)
    # Value of any src="..." attribute.
    findImg = re.compile(r'src="(.*?)"', re.DOTALL)
    # Text of each <span class="title"> element.
    findTitle = re.compile(r'<span class="title">(.*?)</span>', re.DOTALL)
    # Contents of the <p class=""> paragraph.
    findIto = re.compile(r'<p class="">(.*?)</p>', re.DOTALL)
    # Text of the rating_num span.
    findRating = re.compile(
        r'<span class="rating_num" property="v:average">(.*?)</span>',
        re.DOTALL,
    )
    # Text of the <span class="inq"> element.
    findInq = re.compile(r'<span class="inq">(.*?)</span>', re.DOTALL)
    #爬取网页
    def getData(baseUrl):
        """Download ten listing pages and collect one record per entry.

        Pages are requested from ``baseUrl + str(offset)`` for the offsets
        0, 25, ..., 225. Each ``<div class="item">`` on a page produces a
        list laid out as::

            [link, pics, title, second_title, intro, rating, inq]

        ``link``, ``title``, ``second_title``, ``intro`` and ``rating`` are
        strings. ``pics`` and ``inq`` are the complete lists of regex matches
        and may be empty. ``second_title`` is ``' '`` when the entry has only
        one title span.

        The records are also stored in the module-level global ``dlist``,
        and the number of records is printed before returning.

        Args:
            baseUrl: URL prefix to which the start offset is appended.

        Returns:
            The list of records (the same object as the global ``dlist``).

        Raises:
            IndexError: If an entry has no match for the link, title,
                introduction or rating pattern.
        """
        global dlist
        dlist = []
        for page in range(0, 10):
            url = baseUrl + str(page * 25)
            html = askUrl(url)
            # Parse the page and handle its entries one at a time.
            soup = BeautifulSoup(html, "html.parser")
            for item in soup.find_all('div', class_="item"):
                content = []
                # The regex patterns work on text, not on the bs4 tag object.
                item = str(item)
                # Link to the detail page.
                link = re.findall(findLink, item)[0]
                content.append(link)
                # Image addresses (every src="..." match, kept as a list).
                pic = re.findall(findImg, item)
                content.append(pic)
                # Titles: always emit two columns, padding with ' ' when the
                # entry has a single title span.
                title = re.findall(findTitle, item)
                if len(title) == 1:
                    content.append(title[0])
                    content.append(' ')
                else:
                    content.append(title[0])
                    content.append(title[1])
                # Introduction: strip line breaks, the literal "...<br/>"
                # marker and spaces. Plain str.replace is used so the dots
                # are not treated as regex wildcards.
                ito = re.findall(findIto, item)[0]
                ito = ito.replace('\n', '')
                ito = ito.replace('...<br/>', '')
                ito = ito.replace(' ', '')
                content.append(ito)
                # Rating.
                rating = re.findall(findRating, item)[0]
                content.append(rating)
                # One-line quote (every match, kept as a list; may be empty).
                inq = re.findall(findInq, item)
                content.append(inq)
                dlist.append(content)
        print(len(dlist))
        return dlist
    #得到指定一个URL的网页内容
    
    def askUrl(url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        A browser-like ``User-Agent`` header is sent with the request. When
        ``urllib`` raises ``URLError`` (``HTTPError`` is a subclass), the
        error's ``code`` and/or ``reason`` are printed and an empty string
        is returned instead of propagating the exception.

        Args:
            url: Address of the page to download.

        Returns:
            The decoded page text, or ``""`` if the request failed.
        """
        head = {  # request headers that imitate a desktop browser
            "User-Agent": "Mozilla / 5.0(Macintosh;IntelMacOSX10_15_3) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 81.0.4044.122Safari / 537.36"
        }
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            # The with-block closes the response (and its connection) even
            # if read() or decode() raises.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html
    # Run the scraper only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == "__main__":
        main()
    
    
    
    
    
    
  • 相关阅读:
    vs2017 离线安装。
    c# begin & end.
    vc++ 下的WaitForSingleObject
    c# 工厂模式 ,委托 ,事件。
    微信分享 Android Studio
    mac osx 10.9 ftp server端口
    win32 调用多媒体函数PlaySound()
    [汇编语言]-第九章 在屏幕中间分别显示绿底红色,白底蓝色字符串"welcome to masm!"
    [汇编语言]-第九章 jcxz,loop指令,转移位移的意义
    补码
  • 原文地址:https://www.cnblogs.com/jackson1/p/12776794.html
Copyright © 2011-2022 走看看