  • 01 Scraping Dangdang's 500 five-star-rated books

    import requests
    import re
    import json


    # Request a page from Dangdang and return its HTML, or None on any error
    def request_dandan(url):
        try:
            response = requests.get(url)
            # Only hand back the body when the status code is 200
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            return None
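
    If plain GETs start failing (sites like Dangdang may reject clients that send no browser headers), a common tweak is to add a User-Agent and a timeout. This variant is an assumption for robustness, not part of the original script:

    # Hypothetical hardened fetch: a browser-like User-Agent plus a timeout so
    # a stalled request cannot hang the crawl (drop-in replacement, if needed)
    def request_dandan_safe(url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            return None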


    # Parse one page of the list and yield a dict of fields for each book
    def parse_result(html):
        # Compile the pattern once; it pulls out rank, cover image, title,
        # recommendation rate, author, five-star vote count and price per <li>
        pattern = re.compile(
            r'<li>.*?list_num.*?(\d+)\.</div>.*?<img src="(.*?)"'
            r'.*?class="name".*?title="(.*?)">'
            r'.*?class="star">.*?class="tuijian">(.*?)</span>'
            r'.*?class="publisher_info">.*?target="_blank">(.*?)</a>'
            r'.*?class="biaosheng">.*?<span>(.*?)</span></div>'
            r'.*?<p><span class="price_n">&yen;(.*?)</span>.*?</li>',
            re.S)
        items = re.findall(pattern, html)
        for item in items:
            # Yield each record lazily instead of building a full list
            yield {
                'range': item[0],
                'image': item[1],
                'title': item[2],
                'recommend': item[3],
                'author': item[4],
                'times': item[5],
                'price': item[6]
            }
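
    Before pointing the regex at the live site, it can be sanity-checked in a REPL against a hand-written fragment shaped like the markup it assumes; the fragment below is synthetic, not real Dangdang HTML:

    # Synthetic <li> fragment matching the structure the regex expects
    sample = '''<li>
        <div class="list_num red">1.</div>
        <img src="http://img.example.com/book.jpg"/>
        <div class="name"><a title="Sample Book">Sample Book</a></div>
        <div class="star"><span class="tuijian">100%推荐</span></div>
        <div class="publisher_info"><a target="_blank">Some Author</a></div>
        <div class="biaosheng">五星评分次数:<span>12345次</span></div>
        <p><span class="price_n">&yen;19.90</span></p>
    </li>'''
    print(next(parse_result(sample)))
    # -> {'range': '1', 'image': 'http://img.example.com/book.jpg',
    #     'title': 'Sample Book', ..., 'price': '19.90'}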


    # Append each record to book.txt as one JSON object per line
    def write_item_to_file(item):
        print('Writing item ====> ' + str(item))
        with open('book.txt', 'a', encoding='UTF-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')


    def main(page):
        url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
        html = request_dandan(url)
        # Skip this page if the request failed and returned None
        if html is None:
            print('Failed to fetch page ' + str(page))
            return
        items = parse_result(html)  # extract just the fields we want
        for item in items:
            write_item_to_file(item)


    if __name__ == "__main__":
        # The 500-book list spans 25 pages; crawl them all
        for i in range(1, 26):
            main(i)
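
    Since write_item_to_file appends one JSON object per line, the crawl output can be loaded back for later analysis. A minimal sketch, assuming book.txt was produced by a previous run:

    # Read book.txt back into a list of dicts (one JSON object per line)
    import json

    with open('book.txt', encoding='UTF-8') as f:
        books = [json.loads(line) for line in f if line.strip()]
    print('loaded', len(books), 'books; first title:', books[0]['title'])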
  • Original post: https://www.cnblogs.com/cong12586/p/13221481.html