zoukankan      html  css  js  c++  java
  • 提取网址的python练习

    # Python 2 only: urllib2 and cookielib were merged/renamed in Python 3
    # (urllib.request, http.cookiejar).
    import urllib, urllib2, cookielib
    from HTMLParser import HTMLParser
    import sys
    
    # HACK: force the process-wide default encoding to UTF-8 so that
    # implicit str/unicode conversions of Chinese page content don't raise
    # UnicodeDecodeError. reload() is needed because site.py deletes
    # sys.setdefaultencoding at startup. This is a widely discouraged
    # Python 2 workaround -- proper fix is explicit .encode()/.decode().
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    class WebParser(HTMLParser):
        """HTML parser that collects hyperlink targets into a shared set.

        For every ``<a href="...">`` tag seen while feeding HTML:
        absolute URLs (starting with "http") are added as-is, and
        site-relative URLs (starting with "/") are resolved by prefixing
        the base ``path``. All other href values are ignored.
        """

        def __init__(self, links, path):
            HTMLParser.__init__(self)
            self.links = links  # shared set; the caller reads results from it
            self.path = path    # base URL used to resolve relative hrefs

        def handle_starttag(self, tag, attrs):
            # Only anchor tags carry the links we care about.
            if tag != 'a':
                return
            # An empty attrs list simply yields zero iterations, so no
            # special-casing is needed.
            for key, val in attrs:
                if key == 'href':
                    if val.startswith('http'):
                        self.links.add(val)
                    elif val.startswith('/'):
                        self.links.add(self.path + val)
    
    class Crawl:
        def __init__(self):
            self.path = 'http://www.baidu.com'
            self.cookie = cookielib.CookieJar()
            handler = urllib2.HTTPCookieProcessor(self.cookie)
            self.opener = urllib2.build_opener(handler)
    
        def open(self, path):
            self.response = self.opener.open(path)
    
        def showCookie(self):
            for item in self.cookie:
                print 'Name = ' + item.name
                print 'value = ' + item.value
    
        def showResponse(self):
            print self.response.read()
    
        def getAllUrl(self, links, path):
            try:
                self.open(path)
                res = self.response.read()
                parser = WebParser(links, path)
                parser.feed(res)
                parser.close()
            except Exception, e:
                print e
    
        def crawl(self):
            src_links = set()
            result_links = set()
            self.getAllUrl(src_links, self.path)
            n = 200
            while len(src_links) != 0 and n > 0:
                link = src_links.pop()
                if link in result_links:
                    pass
                result_links.add(link)
                self.getAllUrl(src_links, link)
                n -= 1
                print n
    
            return result_links | src_links
    
    c = Crawl()
    rlt = c.crawl()
    for link in rlt:
        print link
  • 相关阅读:
    基础数据类型
    python2x与python3x区别(30个)更新中。。。
    注释
    常量
    变量
    十、SpringCloud config分布式配置中心
    九、Gateway新一代网关
    八、Hystrix断路器(下)
    八、Hystrix断路器(上)
    七、OpenFeign服务接口调用
  • 原文地址:https://www.cnblogs.com/hushpa/p/4671144.html
Copyright © 2011-2022 走看看