1. A simple approach to scraping Sina news
1. requests for fetching pages
2. pyquery for parsing
3. a custom logging helper, logger.py
For fetching the article pages, the code compares three approaches: Python's multiprocessing, gevent coroutines, and a plain loop.
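Before the full spider, here is a rough, hedged sketch (not part of the original project) of how the three fetching strategies can be timed against each other; the URL list and the fetch function are placeholders standing in for get_page_detail(). The multiprocessing variant is left commented out, mirroring the original code, since mixing a process pool with gevent monkey-patching in one process is best avoided.

# timing sketch -- loop vs. gevent pool (multiprocessing variant shown commented out)
import time
import gevent.pool
import gevent.monkey
gevent.monkey.patch_all()
import requests

urls = ['http://finance.sina.com.cn/forex/'] * 10   # stand-in URL list

def fetch(url):
    # stand-in for get_page_detail(): just issue the request
    return requests.get(url).status_code

start = time.time()
for u in urls:                      # plain sequential loop
    fetch(u)
print('loop:', round(time.time() - start, 3))

start = time.time()
pool = gevent.pool.Pool(5)          # 5 concurrent greenlets
pool.map(fetch, urls)
print('gevent:', round(time.time() - start, 3))

# from multiprocessing import Pool
# with Pool(5) as p:                # process-pool variant used in sinaForex.py
#     p.map(fetch, urls)

The full index spider, sinaForex.py, is below; only the gevent variant is left enabled, the other two are kept as comments.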

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
from pyquery import PyQuery as pq
from logger import *
from spiderDetail import *
import time
from multiprocessing import Pool
import gevent.pool
import gevent.monkey

gevent.monkey.patch_all()

sina_forex_url = 'http://finance.sina.com.cn/forex/'

def get_index_pages():
    response = requests.get(sina_forex_url)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        content = response.text
        # URLs of every collected news page; a set is used because the same
        # article can appear in several sections, so duplicates are dropped
        newsSet = set()

        # top headline
        hot_ad_link = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > h1 > a')
        hot_ad_url = hot_ad_link.attr('href')
        hot_ad_title = hot_ad_link.text()
        logger.debug('Headline news')
        print(hot_ad_url)
        newsSet.add(hot_ad_url)

        # focus news list
        logger.debug('Focus news')
        focus_news_lists = get_focus_news(content)
        for new in focus_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)

        # rolling news
        logger.debug('Rolling news')
        roll_news_lists = get_roll_news(content)
        for new in roll_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)

        # 24-hour news express
        logger.debug('24-hour news')
        hours24_news_lists = get_24hours_news(content)
        for new in hours24_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)

        # analysis & data
        logger.debug('Analysis & data')
        analysis_news_lists = get_analysis_news(content)
        for new in analysis_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)

        # institutional opinions
        logger.debug('Institutional opinions')
        institution_opinion_news_lists = get_institution_opinion_news(content)
        for new in institution_opinion_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)

        # expert opinions
        logger.debug('Expert opinions')
        specialist_opinion_news_lists = get_specialist_opinion_news(content)
        for new in specialist_opinion_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)

        # RMB exchange-rate news
        logger.debug('RMB exchange rate')
        rmb_exchange_news_lists = get_RMB_exchange_news(content)
        for new in rmb_exchange_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)

        # fetch the full text of every collected article
        logger.debug('Total news pages collected: ' + str(len(newsSet)))

        # plain loop
        #for url in newsSet:
        #    get_page_detail(url)

        # multiprocessing pool
        #pool = Pool(5)
        #pool.map(get_page_detail, newsSet)
        #pool.close()
        #pool.join()

        # gevent coroutine pool
        pool = gevent.pool.Pool(5)
        data = pool.map(get_page_detail, newsSet)

        return len(newsSet)
    else:
        logger.info('Failed to fetch the Sina forex index page')

# focus news
def get_focus_news(content):
    focus_news_list = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > div.ListB > ul > li')
    for li in focus_news_list.items():
        yield {
            'title': li.text(),
            'url': li('a').attr('href')
        }

# rolling news
def get_roll_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(25) > div.Center > div.ListB > ul > li')
    for li in roll_news_list.items():
        yield {
            'title': li.text(),
            'url': li('a').attr('href')
        }

# 24-hour news express
def get_24hours_news(content):
    roll_news_list = pq(content)('#wrap > div.PartA.Top10 > div.CenterB > div.ListB.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title': li('a').text() + li('span').text(),
            'url': li('a').attr('href')
        }

# analysis & data news
def get_analysis_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(28) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title': li('a').text() + li('span').text(),
            'url': li('a').attr('href')
        }

# institutional opinion news
def get_institution_opinion_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(29) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title': li('a').text() + li('span').text(),
            'url': li('a').attr('href')
        }

# expert opinion news
def get_specialist_opinion_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(30) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title': li('a').text() + li('span').text(),
            'url': li('a').attr('href')
        }

# RMB exchange-rate news
def get_RMB_exchange_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(31) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title': li('a').text() + li('span').text(),
            'url': li('a').attr('href')
        }

#get_index_pages()

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import json
import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup as bs
from hashlib import md5
from logger import *
from DBHelper import *

def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            content = response.text
            pqContent = pq(content)
            title = pqContent('#artibodyTitle').text()
            print(title)
            date = pqContent('#wrapOuter > div > div.page-info > span').text()[:16]
            print(date)

            cnt = bs(content, "lxml")
            body = cnt.find(id='artibody')
            if body:
                # replace Sina's source blockquote with a link to your own site
                blockquote = body.find('blockquote')
                if blockquote:
                    new_tag = cnt.new_tag("a", href="http://www.mysite.com")
                    new_tag.string = "Replace with your own site name and URL"
                    blockquote.replace_with(new_tag)

                # extract the article body between Sina's markers
                regex = re.compile('<!-- 原始正文start -->(.*)<!-- 原始正文end -->', re.S)
                match = re.findall(regex, str(body))
                if match:
                    match = match[0].strip()
                    # download every inline image and point its src at the local copy
                    images = pq(match)('img')
                    for img in images:
                        src = pq(img).attr('src')
                        img_name = get_page_img(src)
                        if img_name:
                            match = match.replace(src, 'img/' + img_name)
                    content_url = write_to_file(match, url)
                    article = {'title': title, 'content': content_url, 'date': date, 'expired': 'false'}
                    insert(article)
                else:
                    logger.info('Could not extract the article body: [%s]' % url)
            else:
                logger.info('No artibody element found on the page: [%s]' % url)
    except RequestException:
        logger.info('Failed to request the article page: %s', url)

# download an image referenced in the article body
def get_page_img(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            actual_img_path = save_image(response.content)
            return actual_img_path
        else:
            return None
    except RequestException:
        logger.info('Failed to request image: %s', url)
        return None

# save an inline image; the file name is the md5 of the image bytes
def save_image(content):
    img_folder = os.path.join(os.getcwd(), 'img')
    os.makedirs(img_folder, exist_ok=True)
    img_name = md5(content).hexdigest()
    img_path = '{0}/{1}.{2}'.format(img_folder, img_name, 'jpg')
    if not os.path.exists(img_path):
        with open(img_path, 'wb') as f:
            f.write(content)
    return img_name + '.jpg'

# save the article body to a file named after the md5 of the article URL
def write_to_file(content, url):
    content_folder = os.path.join(os.getcwd(), 'files')
    os.makedirs(content_folder, exist_ok=True)
    file_name = md5(url.encode('utf-8')).hexdigest()
    file_path = '{0}/{1}.{2}'.format(content_folder, file_name, 'txt')
    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False))
        logger.info('Article body saved --- Sina url: ' + url)
    return file_name
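For debugging, the extractor can be run against a single article page before wiring it into the index spider; the URL below is only a placeholder, substitute a real link taken from the forex index. Naming the body file after the md5 of the article URL, and each image after the md5 of its bytes, means repeated runs simply skip anything already on disk.

if __name__ == '__main__':
    # placeholder URL -- replace with an actual article link
    get_page_detail('http://finance.sina.com.cn/forex/example-article.shtml')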

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
import logging
import time

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# one log file per day
os.makedirs('./log', exist_ok=True)
logFile = './log/log_{0}.txt'.format(time.strftime("%Y%m%d", time.localtime()))

fh = logging.FileHandler(logFile, mode='a')
fh.setLevel(logging.INFO)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s:%(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)
The remaining modules: Models.py, DBHelper.py and config.py.

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pymysql
from config import *
from logger import *

# insert a scraped article into the articles table
def insert(article):
    db = pymysql.connect(host=HOST, port=PORT, user=USERNAME, passwd=PASSWORD,
                         db=DATABASE, charset='utf8', use_unicode=True)
    cursor = db.cursor()
    sql = "insert into articles(title,content,date,expired) values(%s,%s,%s,%s)"
    values = (article['title'], article['content'], article['date'], article['expired'])
    try:
        # parameterized query, so quotes in the title cannot break the statement
        cursor.execute(sql, values)
        db.commit()
        logger.info('Article record inserted: ' + article['title'])
    except Exception:
        logger.error('Failed to insert article record: ' + article['title'])
        db.rollback()
    db.close()

# database config items
HOST = '127.0.0.1'
PORT = 6000          # database port; I changed mine to 6000
DATABASE = 'your database name'
USERNAME = 'your database user'
PASSWORD = 'your database password'

#!/usr/bin/python3
# -*- coding: utf-8 -*-

# simple data model for an article record
class Article:
    ID = None
    Title = ''
    SubTitle = ''
    Summary = ''
    Content = ''
    Date = ''
    Author = ''
    ForumID = 0
    StickyPost = 'false'
    Expired = 'false'
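DBHelper assumes an articles table already exists; the post never shows its schema, so here is a minimal one-off sketch for creating it, with column names taken from the insert statement and column types being assumptions (adjust to your needs).

# create_table.py -- one-off helper, not part of the original project
import pymysql
from config import HOST, PORT, USERNAME, PASSWORD, DATABASE

ddl = """
CREATE TABLE IF NOT EXISTS articles (
    id      INT AUTO_INCREMENT PRIMARY KEY,
    title   VARCHAR(255),
    content VARCHAR(255),   -- stores the md5 file name returned by write_to_file()
    `date`  VARCHAR(32),
    expired VARCHAR(8)
)
"""

db = pymysql.connect(host=HOST, port=PORT, user=USERNAME, passwd=PASSWORD,
                     db=DATABASE, charset='utf8', use_unicode=True)
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()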
2. Configuring nginx
The request entry point is app.py, served behind nginx + uWSGI.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from sinaForex import *
from logger import *
import time

def application(env, start_response):
    start_response('200 OK', [('Content-Type', 'text/html')])
    s = time.time()
    count = get_index_pages()
    e = time.time()
    logger.info("-----------------Scraped {0} news items in {1}s-----------------".format(count, round(e - s, 3)))
    rst = "Scraped {0} news items in {1}s".format(count, round(e - s, 3))
    print(time.localtime(time.time()))
    return [rst.encode('utf-8')]
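Before putting app.py behind nginx, the WSGI callable can be smoke-tested locally with the standard library's built-in server. This is only a sketch, not part of the original deployment; it assumes app.py is importable from the working directory.

# run_local.py -- local smoke test for the WSGI entry point
from wsgiref.simple_server import make_server
from app import application   # assumes app.py is on the import path

httpd = make_server('127.0.0.1', 8080, application)
print('Serving on http://127.0.0.1:8080/ ...')
httpd.serve_forever()

In production, nginx forwards /spider to the uWSGI socket configured below.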
server {
    listen 80;
    root /www/web/sina_forex;
    server_name py.mysite.com;
    index index.html index.php index.htm;
    error_page 400 /errpage/400.html;
    error_page 403 /errpage/403.html;
    error_page 404 /errpage/404.html;
    error_page 503 /errpage/503.html;
    location /spider {
        uwsgi_pass 127.0.0.1:8001;
        include uwsgi_params;
    }
    location / {
        try_files $uri @apache;
    }
}

[uwsgi]
# web service port
socket = :8001
# site root directory
chdir = /www/web/sina_forex
# the module that handles requests
wsgi-file = app.py
vhost = true
master = true
processes = 5
pidfile = /www/web/sina_forex/uwsgi8001.pid
daemonize = /www/web/sina_forex/log/uwsgi8001.log
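With the ini saved to a file (the name uwsgi8001.ini is an assumption, the original does not give one), uWSGI can be started against it:

uwsgi --ini /www/web/sina_forex/uwsgi8001.ini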
With that in place, the web service is reachable through the nginx routing rule at http://py.mysite.com/spider.
3. Collecting news on a schedule
Write a small timer script, autoSpiderTimer.py, that triggers the web request periodically; the process itself is kept alive by supervisor (a sample supervisor entry follows the script below).

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import time

def timer():
    # hit the spider endpoint once a day
    while True:
        response = requests.get('http://py.mysite.com/spider')
        print(response.text)
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        time.sleep(3600 * 24)

timer()
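The post leaves the supervisor side unspecified; a minimal program entry might look like the sketch below. The program name, interpreter path and log path are assumptions, only the project directory comes from the nginx/uWSGI configuration above.

[program:sina_forex_timer]
; assumed paths -- adjust to your environment
command=/usr/bin/python3 /www/web/sina_forex/autoSpiderTimer.py
directory=/www/web/sina_forex
autostart=true
autorestart=true
stdout_logfile=/www/web/sina_forex/log/timer.log
redirect_stderr=true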