  • Web Scraping Practice

    # Douban Top 250: regex-based scraper
    import requests, re, csv

    url = "https://movie.douban.com/top250"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    resp = requests.get(url, headers=headers)
    page_content = resp.text
    # Parse the data with named groups
    obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                     r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?<span '
                     r'class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                     r'<span>(?P<num>.*?)人评价</span>', re.S)
    # Run the match
    result = obj.finditer(page_content)
    f = open("data.csv", mode="w")
    csvwriter = csv.writer(f)
    for it in result:
        # groupdict() returns the named groups as a dict, so fields can be cleaned by name
        dic = it.groupdict()
        dic['year'] = dic['year'].strip()  # strip() removes surrounding whitespace
        csvwriter.writerow(dic.values())
    f.close()
    print("over!")
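    The snippet above only parses the first page of 25 movies. The full Top 250 spans ten list pages that differ only in a start query parameter, so a minimal sketch for walking all of them (assuming every page uses the same markup; it reuses obj, headers, and csvwriter, so it belongs before the f.close() call) could look like this:

    # Sketch: loop over all ten list pages via the start parameter (steps of 25)
    for start in range(0, 250, 25):
        page_url = f"https://movie.douban.com/top250?start={start}"
        resp = requests.get(page_url, headers=headers)
        for it in obj.finditer(resp.text):
            dic = it.groupdict()
            dic['year'] = dic['year'].strip()
            csvwriter.writerow(dic.values())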
    # dytt8 (Movie Heaven): regex-based scraper
    import requests, re, csv

    url = "https://www.dytt8.net/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    resp = requests.get(url, headers=headers, verify=False)
    resp.encoding = 'gb2312'  # set the charset
    # Match the <ul> under the "latest movie recommendations" block
    obj1 = re.compile(r"最新影片推荐.*?<ul>(?P<ul>.*?)</ul>", re.S)
    obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
    obj3 = re.compile(r'◎片  名 (?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<download>.*?)">', re.S)
    result1 = obj1.finditer(resp.text)
    # Collect the child-page links
    child_links = []
    for it in result1:
        ul = it.group('ul')
        # Extract each child-page href from the <ul>
        result2 = obj2.finditer(ul)
        for itt in result2:
            url2 = url + itt.group('href').strip("/")
            child_links.append(url2)
    # Pull the movie title and download link out of each child page
    for href in child_links:
        resp2 = requests.get(href, headers=headers, verify=False)
        resp2.encoding = 'gb2312'  # set the charset
        result3 = obj3.search(resp2.text)
        print(result3.group("movie"))
        print(result3.group("download"))
        break
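    A side note on verify=False: requests emits an InsecureRequestWarning for every unverified call. If the noise gets in the way during testing, urllib3 can silence that specific warning; this is a sketch for development use, not an endorsement of skipping certificate checks:

    # Suppress only the InsecureRequestWarning triggered by verify=False
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)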
    # bs4-based scraper
    import requests, csv
    from bs4 import BeautifulSoup
    url = "http://www.bjtzh.gov.cn/bjtz/home/jrcj/index.shtml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'  # set the charset
    # Open the output file
    f = open("菜价.csv", mode="w")
    csvwriter = csv.writer(f)
    # Parse the data
    # 1. Hand the page source to BeautifulSoup, producing a soup object
    page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
    # 2. Query the soup object
    # find(tag, attribute=value) returns the first match
    # find_all(tag, attribute=value) returns all matches
    # Option 1:
    # div = page.find("div", class_="m-r-main m-textLists")
    # class is a Python keyword, so bs4 uses class_ to avoid a syntax error
    # Option 2:
    div = page.find("div", attrs={"class": "m-r-main m-textLists"})
    # Option 2 sidesteps the class keyword entirely
    # Grab all data rows (tr)
    trs = div.find_all("tr")[1:]  # [1:] slices off the header row
    for tr in trs:  # one row at a time
        tds = tr.find_all("td")  # the cells in this row
        name = tds[0].text
        class1 = tds[1].text
        high = tds[2].text
        avg = tds[3].text
        csvwriter.writerow([name, class1, high, avg])
    f.close()
    print("over!")
    # netbian wallpaper downloader. Create an img folder first, or adjust the script.
    import requests, time
    from bs4 import BeautifulSoup
    url = "https://pic.netbian.com/4kmeinv/"
    url1 = "https://pic.netbian.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = 'gbk'  # set the charset
    # Parse the data
    page = BeautifulSoup(resp.text, "html.parser")  # use the built-in HTML parser
    # Query the soup object: find/find_all(tag, attribute=value)
    div = page.find("div", class_="slist").find_all("a")
    for a in div:
        href = url1 + a.get('href')
        # Fetch the child page
        resp2 = requests.get(href, headers=headers)
        resp2.encoding = 'gbk'  # set the charset
        page2 = BeautifulSoup(resp2.text, "html.parser")
        div2 = page2.find("div", class_="photo-pic")
        img = div2.find("img")
        src = url1 + img.get("src")
        # Download the image
        img_resp = requests.get(src)
        img_name = src.split("/")[-1]
        # Take everything after the last /, e.g. from
        # https://pic.netbian.com/uploads/allimg/210831/102129-163037648996ad.jpg
        # this yields 102129-163037648996ad.jpg
        with open("img/" + img_name, mode="wb") as f:  # save into the img folder
            f.write(img_resp.content)  # write the raw image bytes
        print(img_name + " downloaded OK")
        time.sleep(0.5)  # be polite: throttle requests
    print("OVER")
    # Thread pool + XPath extraction
    import requests, csv
    from lxml import etree
    from concurrent.futures import ThreadPoolExecutor
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
    }
    f = open("1.csv", mode="w", encoding="UTF-8")
    csvwriter = csv.writer(f)
    def page1(url):
        resp = requests.get(url, headers=headers)
        # resp.encoding = "UTF-8"  # charset override, if needed
        html = etree.HTML(resp.text)
        table = html.xpath("/html/body/div[4]/div[3]/div[3]/table[2]/tbody")[0]
        trs = table.xpath("./tr")
        # Walk the rows
        for tr in trs:
            txt = tr.xpath("./td/text()")
            # Light cleanup: strip non-breaking spaces
            txt = (item.replace("\xa0", "") for item in txt)
            # Write the row
            csvwriter.writerow(txt)
        print(url + " extracted")
    if __name__ == '__main__':
        # page1("http://www.maicainan.com/offer/show/classid/14/id/4652.html")  # single-page test
        # Create the thread pool
        with ThreadPoolExecutor(50) as t:  # 50 worker threads
            for i in range(11, 99):  # 88 tasks (ids 4611 through 4698)
                # Submit each page to the pool
                t.submit(page1, f"http://www.maicainan.com/offer/show/classid/14/id/46{i}.html")
        f.close()  # the with-block waits for all tasks, so it is safe to close here
        print("all pages extracted")
  • Original post: https://www.cnblogs.com/bingtang123/p/15374364.html