  • A simple Python crawler

    This script crawls a site's download links and writes the results to a txt file.

    For sites without hotlink protection, you can also try downloading the files directly with the script's download function.

    The script was written for a short-term, site-specific task; to crawl resource links with other patterns you will need to adjust the hard-coded statements yourself (see the sketch after the script).

    I'm a Python beginner, so corrections are welcome.

    # -*- coding: utf-8 -*-  
    import re
    import urllib
    import urllib2
    import requests
    import time
    
    
    #download the file
    def download(page, url):
    	local_filename = url.split('/')[-1] + page + '.jpg'
    	r = requests.get(url, stream=True)
    	with open(local_filename, 'wb') as f:
    		for chunk in r.iter_content(chunk_size = 1024): 
    			if chunk: # filter out keep-alive new chunks
    				f.write(chunk)
    				f.flush()
    
    	return local_filename
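    
    # Example usage (hypothetical URL; only works if the site has no
    # hotlink protection):
    #   download('46876', 'http://www.XXX.com/images/cover.jpg')
    #   -> saves 'cover.jpg46876.jpg' and returns that filename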
    
    
    #turn the data array into urls array
    def print_urls(urls):
    	output_urls = []
    	for link in urls:
    		start_link = link.find('"')
    		end_link = link.find('"', start_link+1)
    		output_link = link[start_link+1: end_link]
    		if output_link.find('http') == -1:
    			output_link = 'http://www.XXX.com' + output_link
    		if link.count('"') > 2:
    			continue
    		else:
    			output_urls.append(output_link)
    	return output_urls
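    
    # Example: relative hrefs get the site root prepended, and entries with
    # more than two quote characters are skipped as malformed:
    #   print_urls(['href="/thing/123"'])
    #   -> ['http://www.XXX.com/thing/123']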
    
    
    def output_download_link_page(page):
    	url = page
    	s = urllib.urlopen(url).read()
    	urls = []
    	new_stl_urls = []
    
    	title = re.findall(r'<h1>.+</h1>', s, re.I)
    	if len(title) != 0:
    		title = title[0]
    	else:
    		title = 'no title'
    
    	img_urls = print_urls(re.findall(r'href=".*?\.jpg.*?"', s, re.I))
    	if len(img_urls) != 0:
    		img_urls = img_urls[0]
    	else:
    		img_urls = 'no image on ' + page
    
    	stl_urls = print_urls(set(re.findall(r'href="/download/.*?"', s, re.I)))
    
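    	# follow each /download/ link's redirects to record the final file URL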
    	for url in stl_urls:
    		#url = urllib2.urlopen(url).url
    		url = requests.get(url).url
    		new_stl_urls.append(url)
    
    	urls.append(title) 
    	urls.append(img_urls) 
    	urls = urls + new_stl_urls
    
    	return urls
    
    #print output_download_link_page('http://www.XXX.com/thing/46876')
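    # The returned list has the form:
    #   [title, first image url, download url 1, download url 2, ...]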
    
    #output all links to download
    def output_all_pages(site):
    	s = urllib.urlopen(site).read()
    	page = re.findall(r'href="/thing/.*?"', s, re.I)
    	page = set(page)
    	return print_urls(page)
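    
    # Example: collect every /thing/ item page linked from one listing page:
    #   output_all_pages('http://www.XXX.com/popular?query=&pg=40')
    #   -> ['http://www.XXX.com/thing/46876', ...]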
    
    
    #output all the sites to download
    def generate_sites(start, end):
    	sites = []
    	for  num in range(start, end):
    		sites.append('http://www.XXX.com/popular?query=&pg=' + str(num))
    	return sites
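    
    # Example: generate_sites(40, 46) yields the six listing pages
    #   'http://www.XXX.com/popular?query=&pg=40' ... 'pg=45'
    # (range() excludes the end value).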
    
    
    #write all the results to a txt file
    file_new = open('1.txt', 'w')  # 'r+' would fail if 1.txt did not exist yet
    sites = generate_sites(40, 46)
    count = 0
    
    for site in sites:
    	print site
    	file_new.write('\n' + site)
    	pages = output_all_pages(site)
    	for page in pages:
    		urls = output_download_link_page(page)
    		# skip pages that yield 10 or more links
    		if len(urls) >= 10:
    			continue
    		count = count + 1
    		for url in urls:
    			file_new.write(url + '\n')
    	print 'done'
    	time.sleep(10)
    
    file_new.close()
    print 'all done. all..' + str(count) + '..models'
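
    As noted above, the link patterns are hard-coded for one target site. The sketch below is one hypothetical way to retarget the crawler: the PATTERNS dict, the pattern names, and the find_links helper are illustrative additions rather than part of the original script, and the .pdf pattern is just an example of a swap. It reuses re and print_urls from the script above.

    # Hypothetical refactor: gather the href regexes that
    # output_download_link_page() and output_all_pages() hard-code,
    # so retargeting the crawler means editing only this dict.
    PATTERNS = {
        'image':    r'href=".*?\.jpg.*?"',    # e.g. swap for r'href=".*?\.pdf.*?"'
        'download': r'href="/download/.*?"',  # site-specific download path
        'page':     r'href="/thing/.*?"',     # per-item page links
    }

    def find_links(html, kind):
        # apply one named href pattern to a page's HTML and normalize the matches
        return print_urls(set(re.findall(PATTERNS[kind], html, re.I)))

    With this in place, output_all_pages() would reduce to find_links(s, 'page'), and pointing the crawler at a different kind of resource becomes a one-line change to PATTERNS.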
    