zoukankan html css js c++ java

随笔写一个简单的爬虫

目标：爬取damai网上即将上演的信息

 1 #!/usr/bin/python
 2 # -*- coding: utf-8 -*-
 3 
 4 import requests, re
 5 from bs4 import BeautifulSoup
 6 
 7 DOWNLOAD_URL = "http://www.damai.cn/bj/"
 8 
 9 #获取url页面内容
10 def download_page(url):
11     headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
12                         'Chrome/51.0.2704.63 Safari/537.36'}
13     data = requests.get(url, headers = headers).content
14     return data
15 #解析html 
16 def x_page(url):
17     soup = BeautifulSoup(url, 'html.parser')
18     li_lst = soup.find('div', class_='index-con').next_sibling.next_sibling.find_all('li')
19     titles = [i.find('dt').find('a').string for i in li_lst]
20     prices = map(lambda x: x.find('p',class_='price').find('strong').text if x.find('p',class_='price').find('strong') is not None else 'none',li_lst)
21     time = [i.find('p',class_='time').string for i in li_lst]
22     places = [i.find('p',class_='place').string for i in li_lst]
23     return titles,prices,time,places
24 
25 if __name__ == '__main__':
26     url = download_page(DOWNLOAD_URL)
27     titles, prices, time, places = x_page(url)
28     info_lst = zip(titles,prices,time,places)
29     #写入文件
30     with open('damai.txt','w+') as f:
31         for j in info_lst:
32             f.write(' '.join(j)+'

')

查看全文

相关阅读:
Scan image with TWAIN scanner and insert into Rich Text (R5/Win32)
软件测试工具汇总
 domino升级602>651
domino SMTP验证LDAPPOP3的实现
 domino升级602>651>851
DOMINO中的内置域
 Attaching and importing image files in one click
传西门子中国运营中近一半业务涉及行贿沧海
 IT程序员：如何化蛹为蝶？沧海
 年度个人职业规划秘笈沧海

原文地址：https://www.cnblogs.com/fuzzier/p/5929453.html