  • Web scraping practice

    #Douban Top250 movies: regex scraper
    import requests, re, csv

    url = "https://movie.douban.com/top250"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    resp = requests.get(url, headers=headers)
    page_content = resp.text
    #Parse the data with named capture groups
    obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                     r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?<span '
                     r'class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                     r'<span>(?P<num>.*?)人评价</span>', re.S)
    #Run the match
    result = obj.finditer(page_content)
    f = open("data.csv", mode="w", newline="")  #newline="" keeps csv from inserting blank rows on Windows
    csvwriter = csv.writer(f)
    for it in result:
        # print(it.group("name"))
        # print(it.group("score"))
        # print(it.group("num"))
        # print(it.group("year").strip())  #strip trims the whitespace
        #Use the named groups as a dict instead
        dic = it.groupdict()
        dic['year'] = dic['year'].strip()
        csvwriter.writerow(dic.values())
    f.close()
    print("over!")
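    Not part of the original post: Douban's Top250 is split across ten pages selected by a start query parameter (start=0, 25, ..., 225), so the single-page fetch above generalizes to a short loop. A minimal sketch reusing obj, headers, and csvwriter from above (it has to run before f.close()):

    #Sketch: walk all ten Top250 pages via the ?start= parameter (assumed to step by 25)
    for start in range(0, 250, 25):
        page_url = f"https://movie.douban.com/top250?start={start}"
        resp = requests.get(page_url, headers=headers)
        for it in obj.finditer(resp.text):
            dic = it.groupdict()
            dic['year'] = dic['year'].strip()
            csvwriter.writerow(dic.values())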
    #dytt8.net (Movie Heaven): regex scraper
    import requests, re, csv

    url = "https://www.dytt8.net/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    resp = requests.get(url, headers=headers, verify=False)
    resp.encoding = 'gb2312'  #set the page charset
    #Match the <ul> inside the "最新影片推荐" (latest recommendations) block
    obj1 = re.compile(r"最新影片推荐.*?<ul>(?P<ul>.*?)</ul>", re.S)
    obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
    obj3 = re.compile(r'◎片  名 (?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<download>.*?)">', re.S)
    result1 = obj1.finditer(resp.text)
    #Collect the child-page links
    sub_urls = []  #renamed from list to avoid shadowing the builtin
    for it in result1:
        ul = it.group('ul')
        #Extract the child-page links
        result2 = obj2.finditer(ul)
        for itt in result2:
            url2 = url + itt.group('href').strip("/")
            sub_urls.append(url2)
            #print(url2)
    #Pull the title and download link from each child page
    for href in sub_urls:
        resp2 = requests.get(href, headers=headers, verify=False)
        resp2.encoding = 'gb2312'  #set the page charset
        #print(resp2.text)
        result3 = obj3.search(resp2.text)
        print(result3.group("movie"))
        print(result3.group("download"))
        break
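    A side note, not in the original: every request made with verify=False makes requests emit an InsecureRequestWarning. If that noise is unwanted, urllib3's standard disable_warnings call silences it:

    #Optional: silence the InsecureRequestWarning triggered by verify=False
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)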
    #bs4 scraper
    import requests,re,csv
    from bs4 import BeautifulSoup
    url = "http://www.bjtzh.gov.cn/bjtz/home/jrcj/index.shtml"
    headers={
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    resp = requests.get(url,headers=headers)
    resp.encoding = 'utf-8' #set the charset
    #Open the output file
    f = open("菜价.csv",mode="w",newline="")  #newline="" keeps csv from inserting blank rows on Windows
    csvwriter = csv.writer(f)
    #Parse the data
    #1. Hand the page source to BeautifulSoup, producing a bs object
    page = BeautifulSoup(resp.text,"html.parser") #use the built-in html parser
    #2. Query the bs object
    #find(tag, attr=value) returns the first match
    #find_all(tag, attr=value) returns every match
    #First form:
    #div = page.find("div",class_="m-r-main m-textLists")
    #class is a Python keyword, so bs4 spells it class_ to avoid a syntax error
    #Second form:
    div = page.find("div",attrs={"class":"m-r-main m-textLists"})
    #The attrs dict sidesteps the class keyword entirely
    #Grab every data row (tr)
    trs = div.find_all("tr")[1:]  #[1:] slices off the header row
    for tr in trs: #one row of data
        tds = tr.find_all("td") #the row's td cells
        name = tds[0].text
        class1 = tds[1].text
        high = tds[2].text
        avg = tds[3].text
        #print(name,class1,high,avg)
        csvwriter.writerow([name,class1,high,avg])
    f.close()
    print("over!")
    #Wallpaper downloads from pic.netbian.com. Create an img folder first, or change the path in the script.
    import requests,re,csv,time
    from bs4 import BeautifulSoup
    url = "https://pic.netbian.com/4kmeinv/"
    url1 = "https://pic.netbian.com"
    headers={
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    resp = requests.get(url,headers=headers)
    resp.encoding = 'gbk' #set the charset
    #print(resp.text)
    #Parse the data
    page = BeautifulSoup(resp.text,"html.parser") #use the built-in html parser
    #Query the bs object:
    #find(tag, attr=value) returns the first match
    #find_all(tag, attr=value) returns every match
    div = page.find("div",class_="slist").find_all("a")
    #print(div)
    for a in div:
        href = url1+(a.get('href'))
        #print(href)
        #Fetch the child page's source
        resp2 = requests.get(href, headers=headers)
        resp2.encoding = 'gbk'  #set the charset
        page2 = BeautifulSoup(resp2.text, "html.parser")
        div2 = page2.find("div",class_="photo-pic")
        img = div2.find("img")
        src = url1+(img.get("src"))
        #print(src)
        #Download the image
        img_resp = requests.get(src)
        #img_resp.content holds the raw bytes
        img_name = src.split("/")[-1]
        #Keep everything after the last /; e.g. from https://pic.netbian.com/uploads/allimg/210831/102129-163037648996ad.jpg
        #the name is 102129-163037648996ad.jpg
        with open("img/"+img_name,mode="wb") as f:   #save into the img folder
            f.write(img_resp.content) #write the image bytes; the with block closes the file itself
        print(img_name + " downloaded OK")
        time.sleep(0.5)
    print("OVER")
    #Thread pool + xpath extraction
    import requests,csv
    from lxml import etree
    from concurrent.futures import ThreadPoolExecutor
    headers={
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    f = open("1.csv",mode="w",encoding="UTF-8",newline="")
    csvwriter = csv.writer(f)
    def page1(url):
        resp = requests.get(url,headers=headers)
        #resp.encoding = "UTF-8"  #set the charset if needed
        #print(resp.text)
        html = etree.HTML(resp.text)
        table = html.xpath("/html/body/div[4]/div[3]/div[3]/table[2]/tbody")[0]
        #print(table)
        trs = table.xpath("./tr")
        #Walk the tr rows
        for tr in trs:
            txt = tr.xpath("./td/text()")
            #print(txt)
            #Light cleanup: strip the non-breaking spaces
            txt = (item.replace("\xa0","") for item in txt)
            #print(list(txt))
            #Store the row
            csvwriter.writerow(txt)
        print(url+" extracted")
    if __name__ == '__main__':
        #page1("http://www.maicainan.com/offer/show/classid/14/id/4652.html")
        #Create the thread pool
        with ThreadPoolExecutor(50) as t:     #50 worker threads
            for i in range(11,99):            #88 tasks
                #Submit each task to the pool
                t.submit(page1,f"http://www.maicainan.com/offer/show/classid/14/id/46{i}.html")
        f.close()
        print("All pages extracted")
  • Original post: https://www.cnblogs.com/bingtang123/p/15374364.html