zoukankan      html  css  js  c++  java
  • 使用 Python 抓取 CSDN 关注人发布的全部文章

    # -*- coding: utf-8 -*-
    """
    @author: jiangfuqiang
    """
    import re
    import urllib2
    import cookielib
    import time
    
    def startParser(author,page=1):
        """Print every article link found on a CSDN author's article-list pages.

        Starting at ``page``, fetches
        ``http://blog.csdn.net/<author>/article/list/<page>`` one page at a
        time, prints each ``<a href="/<user>/article/details/<id>">...</a>``
        match, and stops after the first page that yields fewer than 20
        matches.

        Args:
            author: CSDN user name, interpolated into the list URL.
            page: Number of the first list page to fetch (default 1).

        Side effects:
            Installs a cookie-aware opener as the process-wide ``urllib2``
            opener, sleeps 2 seconds before each request, and writes the
            matches plus a final status line to stdout.

        Errors raised by ``urllib2.urlopen`` are not caught and propagate to
        the caller.
        """
        # The pattern is compiled once, outside the loop.  Every whitespace
        # class must keep its backslash (\w, \d, \s, \t, \n): without them
        # the pattern only matches the literal letters w/d/s.
        link_pattern = re.compile(
            r'<a href="/\w+/article/details/\d+">'
            r'\s*\t*\n*\s*\t*\s*.*?\t*\n*\t*\s*</a>',
            flags=re.MULTILINE)
        # A page with fewer matches than this is treated as the last one.
        # NOTE(review): presumably the site's list page size -- confirm.
        page_size = 20

        cj = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(cj)
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
            'Referer': 'http://my.csdn.net/my/favorite',
        }

        while True:
            # Pause between requests so the site is not hit in a tight loop.
            time.sleep(2)
            url = "http://blog.csdn.net/%s/article/list/%d" % (author, page)
            req = urllib2.Request(url, headers=headers)
            resp = urllib2.urlopen(req)
            try:
                data = resp.read()
            finally:
                # Release the connection even if reading fails.
                resp.close()

            result = link_pattern.findall(data)
            for rd in result:
                print(rd)

            # Advance before the exit check so the final message reports the
            # same page number it always has (one past the last page fetched).
            page = page + 1
            if len(result) < page_size:
                break
        print('success............page:%d' % page)
    
    # Script entry point: crawl one hard-coded author's article list from page 1.
    if __name__ == '__main__':
        startParser(author='yiyaaixuexi', page=1)
    
    《python抓取收藏的文章链接和标题》这篇文章中有 Python 发送邮件的代码,可以将此程序略微修改,把文章链接发送到邮箱,以便以后查阅

    
    
    
    
  • 相关阅读:
    项目总结升级2
    项目总结升级1
    项目总结升级
    项目总结4
    项目总结3
    体温填报app2.0开发
    每日博客
    第一周开课博客
    学习日报
    学习日报
  • 原文地址:https://www.cnblogs.com/gcczhongduan/p/4283057.html
Copyright © 2011-2022 走看看