zoukankan      html  css  js  c++  java
  • 使用Python3.x抓取58同城(南京站)的演出票的信息

     1 #!/usr/bin/env python
     2 #-*-coding: utf-8 -*-
     3 import re
     4 import urllib.request as request
     5 from bs4 import BeautifulSoup as bs
     6 import csv
     7 import os
     8 import sys
     9 from imp import reload 
    10 reload(sys)
    11  
    12 def GetAllLink():
    13     num = int(input("爬取多少页:>"))
    14     if not os.path.exists('./data/'):
    15         os.mkdir('./data/')
    16      
    17     for i in range(num):
    18         if i+1 == 1:
    19             url = 'http://nj.58.com/piao/'
    20             GetPage(url, i)
    21         else:
    22             url = 'http://nj.58.com/piao/pn%s/' %(i+1)
    23             GetPage(url, i)
    24  
    25  
    26 def GetPage(url, num):
    27     Url = url
    28     user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    29     headers = { 'User-Agent' : user_agent }
    30     req = request.Request(Url, headers = headers)
    31     page = request.urlopen(req).read().decode('utf-8')
    32     soup = bs(page, "html.parser")
    33     table = soup.table
    34     tag = table.find_all('tr')
    35     # 提取出所需的那段
    36     soup2 = bs(str(tag), "html.parser")
    37     title = soup2.find_all('a','t')         #标题与url 
    38     price = soup2.find_all('b', 'pri')      #价格
    39     fixedprice = soup2.find_all('del')      #原价
    40     date = soup2.find_all('span','pr25')    #时间 
    41  
    42     atitle = []
    43     ahref = []
    44     aprice = []
    45     afixedprice = []
    46     adate = []
    47  
    48     for i in title:
    49         #print i.get_text(), i.get('href')
    50         atitle.append(i.get_text())
    51         ahref.append(i.get('href'))
    52     for i in price:
    53         #print i.get_text()
    54         aprice.append(i.get_text())
    55     for i in fixedprice:
    56         #print j.get_text()
    57         afixedprice.append(i.get_text())
    58     for i in date:
    59         #print i.get_text()
    60         adate.append(i.get_text())
    61 
    62     csvfile = open('./data/ticket_%s.csv'%num, 'w')
    63     writer = csv.writer(csvfile)
    64     writer.writerow(['标题','url','售价','原价','演出时间'])
    65     '''
    66     每个字段必有title,但是不一定有时间date
    67     如果没有date日期,我们就设为'---'
    68     '''
    69     if len(atitle) > len(adate):
    70         for i in range(len(atitle) - len(adate)):
    71             adate.append('---')
    72         for i in range(len(atitle) - len(afixedprice)):
    73             afixedprice.append('---')
    74         for i in range(len(atitle) - len(aprice)):
    75             aprice.append('---')
    76             
    77     for i in range(len(atitle)):
    78             message = atitle[i]+'|'+ahref[i]+'|'+aprice[i]+ '|'+afixedprice[i]+'|'+ adate[i]
    79             writer.writerow([i for i in str(message).split('|')])
    80     print ("[Result]:> 页面 %s 信息保存完毕!"%(num+1))
    81     csvfile.close()
    82  
    83  
    84 if __name__ == '__main__':
    85     GetAllLink()

    参考地址

  • 相关阅读:
    SGU 187 Twist and whirl
    伸展树---初步学习
    poj 2503 Babelfish
    sublime 3 phpfmt配置(大括号对齐)
    Linux Shell 错误: $' ': command not found错误解决
    redis 使用场景
    wireshark tcp 三次握手详解
    ip地址和子网掩码
    phpstorm 远程调试 php
    win10,ubuntu时间不对问题
  • 原文地址:https://www.cnblogs.com/yanduanduan/p/7286128.html
Copyright © 2011-2022 走看看