zoukankan      html  css  js  c++  java
  • 使用python抓取CSDN关注人发布的全部文章

    # -*- coding: utf-8 -*-
    """
    @author: jiangfuqiang
    """
    import re
    import urllib2
    import cookielib
    import time
    
    def startParser(author, page=1):
        """Print the article links found on a CSDN author's article-list pages.

        Starting at ``page``, fetches
        ``http://blog.csdn.net/<author>/article/list/<page>`` and prints every
        ``<a href="/<user>/article/details/<id>">...</a>`` element matched on
        it, then moves to the next page.  Crawling stops after the first page
        that yields fewer than 20 matches.

        Side effects: installs a cookie-aware opener globally through
        ``urllib2.install_opener``, sleeps 2 seconds before every request, and
        writes the matches plus a final status line to stdout.

        Written for Python 2 (``urllib2`` / ``cookielib``).

        :param author: CSDN blog user name, inserted into the list URL.
        :param page: 1-based number of the first list page to fetch.
        :returns: None.
        :raises urllib2.URLError: propagated from ``urllib2.urlopen``.
        """
        # NOTE(review): the backslash escapes below were restored from a copy
        # of this script in which they had been stripped or expanded; confirm
        # the pattern against a live list page.
        link_pattern = re.compile(
            r'<a href="/\w+/article/details/\d+">'
            r'\s*\t*\n*\s*\t*\s*.*?\t*\n*\t*\s*</a>',
            flags=re.MULTILINE)
        # A page with fewer matches than this is treated as the last one
        # (presumably the site's page size -- TODO confirm).
        full_page_size = 20

        cj = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(cj)
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
            'Referer': '    http://my.csdn.net/my/favorite',
        }

        while True:
            time.sleep(2)  # throttle: one request every two seconds
            url = "http://blog.csdn.net/%s/article/list/%d" % (author, page)
            resp = urllib2.urlopen(urllib2.Request(url, headers=headers))
            try:
                data = resp.read()
            finally:
                # Release the connection even if reading fails.
                resp.close()

            result = link_pattern.findall(data)
            for rd in result:
                print(rd)

            if len(result) < full_page_size:
                break
            # Advance only when another page will be fetched, so the status
            # line below reports the last page actually read.
            page += 1
        print('success............page:%d' % page)
    
    if __name__ == '__main__':
        # Script entry point: crawl the blog of user 'yiyaaixuexi' from page 1.
        startParser('yiyaaixuexi', page=1)
    
    《python抓取收藏的文章链接和标题》这篇文章中有python发送邮件的代码,可以将此程序略作修改,把文章链接发送到邮箱,以便以后查阅。

    
    
    
    
  • 相关阅读:
    关于 Bellman-Ford 与 Floyd 算法的一点感想
    中途相遇法 解决 超大背包问题 pack
    具体一些的博弈论 sqrstone
    SG函数学习总结
    mc
    string
    积木大赛
    pta l3-20(至多删三个字符)
    pta l3-7(天梯地图)
    ucore-lab1-练习2report
  • 原文地址:https://www.cnblogs.com/gcczhongduan/p/4283057.html
Copyright © 2011-2022 走看看