zoukankan      html  css  js  c++  java
  • 使用requests+pyquery爬取dd373地下城跨五最新商品信息

    废话不多说直接上代码:

      可以使用openpyxl库将爬取的信息写入Excel表格中,这部分代码我就不上传了

    import requests
    from urllib.parse import urlencode
    from requests import RequestException
    from pyquery import PyQuery as pq
    
    def open_sh():
        """Fetch the first page of the dd373 listing and return its HTML.

        The request carries a desktop browser User-Agent and the price filter
        (``minPrice=333``, empty ``maxPrice``) as query-string parameters.

        Returns:
            The response body as text when the server answers with HTTP 200;
            ``None`` for any other status code or when the request fails.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        data = {
            "minPrice": 333,
            "maxPrice": ""
        }
        url = "https://www.dd373.com/s/rbg22w-x9kjbs-wwf11b-0-0-0-qquvn4-0-0-0-0-0-0-0-0.html?" + urlencode(data)
        try:
            # Without a timeout a stalled connection would block forever;
            # a timeout error is a RequestException and is handled below.
            response = requests.get(url, headers=headers, timeout=10)
        except RequestException:
            print("链接错误",url)
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def doc_page(html):
        """Print one dict per listing found in the given page HTML.

        Each ``.box.money_ner`` element under ``div.content`` is turned into a
        dict holding the title link's href, the title text, the price text
        with '元' appended, and the text of ``div.num.left``. Nothing is
        returned; the dicts are only printed.
        """
        listings = pq(html)("div.content").find(".box.money_ner")
        for entry in listings.items():
            # The title anchor supplies both the link target and the description.
            title_link = entry.find("a.titleText")
            price_text = entry.find("div.money_text strong span").text()
            print({
                "地址": title_link.attr("href"),
                "账号信息": title_link.text(),
                "价格": price_text + '元',
                "是否存在": entry.find("div.num.left").text(),
            })
    def page_sh(pagebox):
        """Walk result pages 1..pagebox and print the listings on each one.

        Every page is downloaded exactly once, with the browser User-Agent
        header. The downloaded HTML is used both to confirm that the
        highlighted page number matches the requested page and to extract
        the listings.

        Args:
            pagebox: Total number of result pages to visit (inclusive).

        Raises:
            Any exception from the request or from parsing the page number
            propagates to the caller unchanged.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        data = {
            "minPrice": 333,
            "maxPrice": ""
        }
        # The query string does not depend on the page, so build it once.
        query = urlencode(data)
        for page in range(1, pagebox + 1):
            url = "https://www.dd373.com/s/rbg22w-x9kjbs-wwf11b-0-0-0-qquvn4-0-0-0-0-0-0-0-%s.html?%s" % (page, query)
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code != 200:
                # Pages that do not come back with HTTP 200 are skipped.
                continue
            html = response.text
            # Only parse the listings when the site really served the page
            # that was asked for (the highlighted pager entry matches).
            if page_currentpage(html) == page:
                doc_page(html)
    
    def page_currentpage(html):
        """Return the highlighted page number of the pagination bar as an int.

        The number is read from the text of the ``a.nb.currentpage`` element
        and is used by callers to check which page is being displayed.
        ``int()`` raises ``ValueError`` if that text is not a plain integer.
        """
        highlighted = pq(html)("a.nb.currentpage")
        return int(highlighted.text())
    
    def page_box(html):
        """Return the total number of result pages as an int.

        The value is cut out of the text of ``.pagebox.clear ul li.yeshu`` by
        dropping the first nine characters and the last one.
        """
        pager_text = pq(html)(".pagebox.clear ul li.yeshu").text()
        # NOTE(review): the fixed [9:-1] slice assumes a constant-width prefix
        # and a one-character suffix around the page count -- confirm against
        # the live page markup.
        total_pages = pager_text[9:-1]
        return int(total_pages)
    
    def main():
        """Fetch the first listing page, read the page count, crawl all pages.

        Stops early with a message when the first page could not be fetched,
        because ``open_sh`` returns ``None`` in that case and there is nothing
        to read the page count from.
        """
        html = open_sh()
        if html is None:
            print("未能获取首页内容,停止爬取")
            return
        page = page_box(html)
        page_sh(page)
    
    
    
    # Start the crawl only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        main()
    

      

  • 相关阅读:
    按字母分类的产品组件
    container中的内容 垂直-水平居中
    安卓手机--键盘谈起后 fixed背景图片被键盘顶起的问题
    清除样式的css
    vue 路由(二级)配置及详细步骤
    vue 路由 URL传参
    路由表的组件群
    vue 路由传参
    vue 路由入门(vue-router)
    jQuery对文档的操作
  • 原文地址:https://www.cnblogs.com/zhmiao/p/10684570.html
Copyright © 2011-2022 走看看