zoukankan      html  css  js  c++  java
  • 爬取电商售卖信息

     1 #! /usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 # Filename:spider_58center_sth.py
     4 
     5 from bs4 import BeautifulSoup
     6 import time
     7 import requests
     8 
     9 url_58 = 'http://nj.58.com/?PGTID=0d000000-0000-0c5c-ffba-71f8f3f7039e&ClickID=1'
    10 
    11 '''
    12 用于爬取电商售卖信息:例为58同城电脑售卖信息'''
    13 
    14 
    15 def get_url_list(url):
    16     web_data = requests.get(url)
    17     soup = BeautifulSoup(web_data.text, 'lxml')
    18     url = soup.select('td.t > a[class="t"]')
    19     url_list = ''
    20     for link in url:
    21         link_n = link.get('href')
    22         if 'zhuanzhuan' in link_n:
    23             pass
    24         else:
    25             if 'jump' in link_n:
    26                 pass
    27             else:
    28                 url_list = url_list + '
    ' + link_n
    29 
    30     print('url_list: %s' % url_list)
    31     return url_list
    32 
    33 
    34 # 分类获取目标信息
    35 def get_url_info():
    36     url_list = get_url_list(url_58)
    37 
    38     for url in url_list.split():
    39         time.sleep(1)
    40         web_datas = requests.get(url)
    41         soup = BeautifulSoup(web_datas.text, 'lxml')
    42 
    43         type = soup.select('#head > div.breadCrumb.f12 > span:nth-of-type(3) > a')
    44         title = soup.select(' div.col_sub.mainTitle > h1')
    45         date = soup.select('li.time')
    46         price = soup.select('div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.summary > ul > '
    47                             'li:nth-of-type(1) > div.su_con > span.price.c_f50')
    48         fineness = soup.select('div.col_sub.summary > u1 > li:nth-of-type(2) > div.su_con > span')
    49         area = soup.select('div.col_sub.summary > u1 > li:nth-of-type(3) > div.su_con > span')
    50 
    51         for typei, titlei, datei, pricei, finenessi, areai in zip(type, title, date, price, fineness, area):
    52 
    53             # 做字典
    54             data = {
    55                 'type': typei.get_text(),
    56                 'title': titlei.get_text(),
    57                 'date': datei.get_text(),
    58                 'price': pricei.get_text(),
    59                 'fineness': (finenessi.get_text()).strip(),
    60                 'area': list(areai.stripped_strings)
    61             }
    62             print(data)
    63 
    64     get_url_info()

    爬取商城商品售卖信息

  • 相关阅读:
    算法:POJ1008 Maya Calendar
    给我的十八岁
    算法:POJ1007 DNA sorting
    算法:POJ1006 三重峰值问题
    【树链剖分】洛谷P3384树剖模板
    【树链剖分】洛谷P3379 树链剖分求LCA
    【Tarjan缩点】PO3352 Road Construction
    【Dijkstra堆优化】洛谷P2243电路维修
    【Tarjan缩点】POJ2186 Popular Cows
    【最短路·差分约束】洛谷P1250
  • 原文地址:https://www.cnblogs.com/DeRozan/p/7660686.html
Copyright © 2011-2022 走看看