  • Crawler Learning (10): Scraping Data with Raw Regular Expressions

    Qiushibaike (糗事百科) image-scraping example

    import os
    import re
    import time
    import urllib.request
    import urllib.parse


    # Read the target page range and the folder name for the downloaded images
    def header():
        start_page = int(input("请输入起始页"))
        end_page = int(input("请输入结束页"))
        qiutu = input("请输入文件名字")
        # Crawl every page in the requested range
        for page in range(start_page, end_page + 1):
            print("正在爬取第%s页" % page)
            request = headle_request(page)
            download(request, qiutu)
            # Pause between pages so the site does not flag the crawler as a malicious attack
            time.sleep(2)


    # Build the request object: splice the page number into the URL and attach a User-Agent header
    def headle_request(page):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
        url = "https://www.qiushibaike.com/pic/page/%s/?s=5167052" % page
        request = urllib.request.Request(url, headers=headers)
        return request


    # Download the target images referenced on the page behind the given request
    def download(request, qiutu):
        # Fetch the response data
        response = urllib.request.urlopen(request)
        # Create the output folder if it does not exist yet
        if not os.path.exists(qiutu):
            os.mkdir(qiutu)
        content = response.read().decode("utf8")
        # Regular expression that captures each image URL and its alt text [key point]
        img = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>', re.S)
        # findall matches the target tags and returns a list of (src, alt) tuples
        ret = img.findall(content)
        for x in ret:
            img_url = "http:" + x[0]
            # Use the alt text as the image file name
            filename = x[1] + ".jpg"
            # Build the full storage path inside the output folder
            image_path = os.path.join(qiutu, filename)
            urllib.request.urlretrieve(img_url, image_path)
            time.sleep(1.5)


    if __name__ == '__main__':
        header()
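
    The regex in download() is the part the comment flags as the key point. Below is a minimal sketch for checking that pattern in isolation; the HTML snippet (its src path and alt text) is made up purely for illustration, so the real page markup may differ.

    import re

    # Illustrative snippet only; not taken from the live site
    sample = '''
    <div class="thumb">
    <a href="/article/121625260" target="_blank">
    <img src="//pic.qiushibaike.com/system/pictures/medium/example.jpg" alt="funny pic" />
    </a>
    </div>
    '''

    pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>', re.S)
    for src, alt in pattern.findall(sample):
        # Prepend the scheme, exactly as download() does, and show the alt text
        print("http:" + src, alt)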



    Inspirational-quotes site (励志网, yikexun.cn) scraping example

    import os
    import re
    import time
    import urllib.request
    import urllib.parse


    def main():
        start_page = int(input("请输入抓取的起始页:"))
        end_page = int(input("请输入抓取的结束页:"))
        for page in range(start_page, end_page + 1):
            print("正在爬取第%d页" % page)
            ret = request(page)
            content(ret)


    # Fetch one listing page and return a list of (article URL, title) tuples
    def request(page):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
        url = "http://www.yikexun.cn/lizhi/qianming/list_50_%s.html" % page
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request).read().decode("utf8")
        pattern = re.compile(r'<div class="art-t">.*?<a href="(.*?)"><b>(.*?)</b></a>.*?</div>', re.S)
        ret = pattern.findall(response)
        return ret


    # Download each article and save its body to an HTML file named after the title
    def content(ret):
        if not os.path.exists("励志语录1"):
            os.mkdir("励志语录1")
        for content in ret:
            title = content[1] + ".html"
            article_url = "http://www.yikexun.cn" + content[0]
            article_path = os.path.join("励志语录1", title)
            response = urllib.request.urlopen(article_url)
            string = response.read().decode("utf8")
            regular = re.compile(r'(<div class="neirong">.*?<p>(.*?)</p>.*?</div>)', re.S)
            neirong = regular.findall(string)
            for info in neirong:
                # info[0] is the whole article <div>; wrap it with the title as a heading
                cont = '<h1 style="color:blue">%s</h1> %s' % (content[1], info[0])
                # The with-block closes the file automatically, so no explicit close() is needed
                with open(article_path, "w", encoding="utf8") as tf:
                    tf.write(cont)
            time.sleep(1)


    if __name__ == '__main__':
        main()
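
    As with the first example, the regex that pulls the article body out of the detail page can be checked on its own. The sketch below uses a made-up article snippet and a hypothetical title string to show how the captured block is wrapped into the HTML that gets written to disk.

    import re

    # Illustrative snippet only; the real article markup may differ
    sample = '''
    <div class="neirong">
    <p>1、Some inspirational sentence.</p>
    <p>2、Another one.</p>
    </div>
    '''

    regular = re.compile(r'(<div class="neirong">.*?<p>(.*?)</p>.*?</div>)', re.S)
    for info in regular.findall(sample):
        # info[0] is the whole <div class="neirong">...</div> block, info[1] only the first <p>
        cont = '<h1 style="color:blue">%s</h1> %s' % ("示例标题", info[0])
        print(cont)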


  • Original post: https://www.cnblogs.com/kuangkuangduangduang/p/10374888.html