zoukankan      html  css  js  c++  java
  • 爬虫实例

    # 爬取糗图上的图片

    """Download the images found on qiushibaike.com picture-listing pages.

    The script asks for a start and an end page number, fetches every listing
    page in that inclusive range, extracts the ``<img>`` sources with a regular
    expression and stores each image under ``./糗图`` in the current working
    directory.
    """

    import os
    import re
    import urllib.request

    # Headers sent with every request (listing pages and images alike).
    HEADERS = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) Apple WebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }

    # Captures the ``src`` value of tags shaped like <img src="..." alt="..." />.
    IMAGE_PATTERN = re.compile(r'<img src="(.*?)" alt=".*?" />')

    # Seconds to wait on a network operation before giving up, so that a stalled
    # server cannot hang the script forever.
    REQUEST_TIMEOUT = 30


    def handler_request(url, page):
        """Build the request object for one listing page.

        Args:
            url: Base URL of the listing; the page number is appended to it.
            page: Page number (anything ``str()`` can render).

        Returns:
            A ``urllib.request.Request`` for ``url + str(page) + "/"`` carrying
            the shared ``HEADERS``.
        """
        return urllib.request.Request(url + str(page) + "/", headers=HEADERS)


    def download_image(page, html):
        """Download every image referenced in ``html`` into the ``糗图`` folder.

        Files are named ``<page>_<index>.jpg``. The separator keeps names
        unique: without it page 1 / image 11 and page 11 / image 1 would both
        map to ``111.jpg`` and overwrite each other.

        A failure to fetch one image is reported and skipped so that the
        remaining images are still downloaded.

        Args:
            page: Page number the HTML came from; used in file names and messages.
            html: Decoded HTML text of the listing page.
        """
        dirs = os.path.join(os.getcwd(), "糗图")
        os.makedirs(dirs, exist_ok=True)

        for i, src in enumerate(IMAGE_PATTERN.findall(html)):
            # Only add a scheme when the captured URL does not already have one.
            if not src.startswith(("http://", "https://")):
                src = "https:" + src

            name = "%s_%s.jpg" % (page, i)
            file_name = os.path.join(dirs, name)
            print("图片%s开始下载..." % name)

            try:
                request = urllib.request.Request(src, headers=HEADERS)
                # The context manager closes the connection even if read() fails.
                with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
                    image = response.read()
            except Exception as e:
                # Best-effort per image: report the cause and move on.
                print("图片%s下载出错了: %s" % (name, e))
                continue

            with open(file_name, "wb") as f:
                f.write(image)
            # Announce success only once the bytes are actually on disk.
            print("图片%s已经下载完毕" % name)


    def main():
        """Prompt for a page range and download the images of every page in it."""
        url = "https://www.qiushibaike.com/pic/page/"
        start_page = int(input("请输入你想要查询的起始页:"))
        end_page = int(input("请输入你想要查询的结束页:"))

        for page in range(start_page, end_page + 1):
            print("第%s页开始下载..." % page)
            request = handler_request(url, page)
            with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
                content = response.read().decode()
            download_image(page, content)
            print("第%s页已经下载完毕" % page)
            print()
            print()


    if __name__ == '__main__':
        main()
  • 相关阅读:
    Linux I2C驱动程序设计
    I2C裸机驱动程序设计
    Linux串口驱动程序设计
    Linux平台总线设备驱动
    Linux总线设备驱动模型
    Linux内核同步机制
    Linux设备驱动开发基础--阻塞型设备驱动
    模型评估方法和性能指标
    机器学习——XGBoost
    机器学习——GBDT
  • 原文地址:https://www.cnblogs.com/nxrs/p/11335241.html
Copyright © 2011-2022 走看看