  • A small example of a targeted (focused) crawler

    Demo download: Java and Python (the Python script is listed first, followed by the Java crawler)

    # -*- coding: utf-8 -*-
    import urllib2
    from lxml import etree
    import Queue
    import time
    import os
    
    
    def getHtml(url):
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
        doc = urllib2.urlopen(request, timeout=45).read().decode('gbk')
        return doc
    
    # Candidate seed URLs; the uncommented one is used as the crawl seed
    # seed = 'http://it.dataguru.cn/'
    # seed = 'http://bi.dataguru.cn/'
    seed = 'http://science.dataguru.cn/'
    que_urls = Queue.Queue()
    que_urls.put(seed)
    
    
    def getCurTimeStamp(root='/data/data/dataguru/science/'):
        """
        获取当前时间戳:离1970年1月1日午夜开始的毫秒数
        :return:
        """
        return root + str(int(time.time() * 1000)) + '.txt'
    
    
    def start():
        while que_urls.qsize() > 0:
            url = que_urls.get()
            html = getHtml(url)
            dom = etree.HTML(html)
            # links = dom.xpath(u"//div[@id='ct']//a[@class='xi2']/@href")
            links = dom.xpath(u"//div[@id='ct']//a[@class='xi2']")
            print len(links)
            for lk in links:
                print lk.text, lk.xpath('./@href')
                try:
                    link = lk.xpath('./@href')[0]
                    html_c = getHtml(link)
                    dom_c = etree.HTML(html_c)
                    article = dom_c.xpath('//td[@id="article_content"]//text()')
                    content = os.linesep.join(article)
                    content = content.replace('\n', '')
                    with open(getCurTimeStamp(), 'wb') as mf:
                        mf.write(link + os.linesep)
                        mf.write(lk.text.encode('utf-8') + os.linesep)
                        mf.write(content.encode('utf-8'))
                except Exception as e:
                    print e
                    continue
    
            links_next = dom.xpath('//div[@id="ct"]//a[@class="nxt"]')
            for lk in links_next:
                print lk.text, lk.xpath('./@href')
                que_urls.put(lk.xpath('./@href')[0])
    
    import jieba
    if __name__ == '__main__':
        # start()  # uncomment to run the crawler; the lines below demo jieba segmentation
        # sen = '我来到北京清华大学'
        sen = '他来到了网易杭研大厦'
        seg_list = jieba.cut(sen, cut_all=False)  # accurate (default) mode; returns a generator
        res = "/ ".join(seg_list)                 # joining consumes the generator once
        print type(seg_list)
        print "Default Mode:", res
    package com.data.crawl.qa.baiduzhidao;

    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.LinkedList;
    import java.util.Queue;
    import java.util.regex.Pattern;

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.htmlcleaner.HtmlCleaner;
    import org.htmlcleaner.TagNode;
    import org.htmlcleaner.XPatherException;

    public class Crawl {

        private static Log log = LogFactory.getLog(Crawl.class);

        private HtmlCleaner cleaner = new HtmlCleaner();

        private HttpClientPool httpPool = new HttpClientPool();

        private Queue<String> queue = new LinkedList<String>();

        private Pattern Pat_index = Pattern.compile("http://zhidao.baidu.com/browse/\\d+(\\?pn=\\d+#list)?");
        // http://zhidao.baidu.com/browse/82?pn=25#list
        // http://zhidao.baidu.com/browse/82?pn=50#list
        // http://zhidao.baidu.com/browse/82

        private Pattern Pat_content = Pattern.compile("http://zhidao.baidu.com/question/\\d+.html\\?entry=qb_browse_default");

        // http://zhidao.baidu.com/question/1732680699842305627.html?entry=qb_browse_default
        // http://zhidao.baidu.com/question/368440625636623924.html?entry=qb_browse_default
        // http://zhidao.baidu.com/question/1946360168489647948.html?entry=qb_browse_default

        public void start(String seed) {
            queue.add(seed);
            while (queue.size() > 0) {
                String uri = queue.poll();
                String html = httpPool.downHtml(uri);
                if (Pat_index.matcher(uri).find()) {
                    getOutlinks(html, uri);
                }else if(Pat_content.matcher(uri).find()){
                    getFields(html, uri);
                }else{
                    log.info("regex err: " + uri);
                }
            }
        }

        private void getFields(String html, String uri) {
            // Parse a question page and log the question title
            
            TagNode doc = cleaner.clean(html);
            try {
                Object[] tags_title = doc.evaluateXPath("//span[@class='ask-title  ']");            
                String title = ((TagNode)tags_title[0]).getText().toString();
                log.info(title);
            } catch (XPatherException e) {
                e.printStackTrace();
            }
            
        }

        public static void main(String[] args) {

            Crawl crawl = new Crawl();
            String seed = "http://zhidao.baidu.com/browse/82";
            crawl.start(seed);
            log.info("complete");
        }

        public void getOutlinks(String html, String base) {
            TagNode doc = cleaner.clean(html);

            try {
                URL baseUrl = new URL(base);
                Object[] tags_content = doc.evaluateXPath("//a[@class='question-title']");
                for (Object object : tags_content) {
                    String relativeurl = ((TagNode) object).getAttributeByName("href");
                    URL url = new URL(baseUrl, relativeurl);
                    queue.add(url.toString());
                }
                
                // Follow the "next page" link; the last page has no pager-next anchor
                Object[] tags_next = doc.evaluateXPath("//a[@class='pager-next']");
                if (tags_next.length > 0) {
                    String relative_url_next = ((TagNode) tags_next[0]).getAttributeByName("href");
                    URL url = new URL(baseUrl, relative_url_next);
                    queue.add(url.toString());
                }

            } catch (XPatherException e) {
                log.warn(e.getMessage());
            } catch (MalformedURLException e) {
                e.printStackTrace();
            }
        }

    }
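
    For comparison, the index-page / question-page routing that start() performs with Pat_index and Pat_content can be sketched in Python as well (a sketch only; the patterns are copied from the Java class, and page downloading is omitted because HttpClientPool is not shown in the demo):

    import re

    # Mirror Pat_index and Pat_content from the Java Crawl class
    PAT_INDEX = re.compile(r'http://zhidao\.baidu\.com/browse/\d+(\?pn=\d+#list)?')
    PAT_CONTENT = re.compile(r'http://zhidao\.baidu\.com/question/\d+\.html\?entry=qb_browse_default')

    def route(uri):
        """Classify a crawled URL: list (index) page, question (content) page, or unknown."""
        if PAT_INDEX.search(uri):
            return 'index'
        if PAT_CONTENT.search(uri):
            return 'content'
        return 'unknown'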


     
  • Original post: https://www.cnblogs.com/i80386/p/3282064.html