zoukankan      html  css  js  c++  java
  • 42、lucene和机器学习进行全文搜索,并排序

    package com.lucene.test;
    
    import java.io.BufferedInputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.joone.engine.FullSynapse;
    import org.joone.engine.LinearLayer;
    import org.joone.engine.Monitor;
    import org.joone.engine.NeuralNetEvent;
    import org.joone.engine.NeuralNetListener;
    import org.joone.engine.SigmoidLayer;
    import org.joone.engine.learning.TeachingSynapse;
    import org.joone.io.MemoryInputSynapse;
    import org.joone.io.MemoryOutputSynapse;
    import org.joone.net.NeuralNet;
    import org.junit.Test;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    import com.lucene.domain.Article;
    
    /**
     * Demo test class bundling two independent experiments:
     * <ul>
     *   <li>building and querying a Lucene full-text index over a few files of the
     *       {@code ./20_newsgroups} directory ({@link #testCreateIndex()},
     *       {@link #testSearch()});</li>
     *   <li>training and interrogating a small 2-3-1 Joone feed-forward network on the
     *       XOR truth table ({@link #testNeuralNet()}).</li>
     * </ul>
     * The class registers itself as the network's {@link NeuralNetListener} and prints
     * progress to standard output.
     */
    public class TestLucene implements NeuralNetListener{
        /** Root directory of the corpus to index. */
        private static final String CORPUS_DIR = "./20_newsgroups";
        /** Directory holding the Lucene index. */
        private static final String INDEX_DIR = "./indexDir/";
        /** Upper bound on how many entries of the corpus root are examined. */
        private static final int MAX_CATEGORIES = 1;
        /** Upper bound on how many files are indexed per category directory. */
        private static final int MAX_FILES_PER_CATEGORY = 10;
        /** Maximum number of hits requested from the searcher. */
        private static final int MAX_HITS = 100;

        private NeuralNet nnet = null;
        private MemoryInputSynapse inputSynapse,desireOutputSynapse;
        LinearLayer input;
        SigmoidLayer hidden,output;
        boolean singleThreadMode = true;
        
        // XOR input patterns, one row per pattern
        private double[][] inputArray = new double[][]{
                {0.0,0.0},
                {0.0,1.0},
                {1.0,0.0},
                {1.0,1.0}
        };
        
        // XOR desired output, row-aligned with inputArray (1 XOR 1 = 0)
        private double[][] desiredOutputArray = new double[][]{
                {0.0},
                {1.0},
                {1.0},
                {0.0}
        };
        
        /**
         * Builds the index: reads up to {@link #MAX_FILES_PER_CATEGORY} regular files from
         * each of the first {@link #MAX_CATEGORIES} entries of {@link #CORPUS_DIR} and adds
         * one document (fields {@code id}, {@code title}, {@code content}) per file to the
         * index in {@link #INDEX_DIR}. Does nothing when the corpus root cannot be listed.
         *
         * @throws Exception if a file cannot be read or the index cannot be written
         */
        @Test
        public void testCreateIndex() throws Exception{
            File[] categories = new File(CORPUS_DIR).listFiles();
            if(categories == null){
                // listFiles() yields null when the path is not a (readable) directory
                return;
            }

            int fileNum = 1;
            // One writer for the whole run; try-with-resources closes everything even on failure.
            // NOTE(review): the writer config is left at its defaults, so re-running this test
            // presumably appends to an existing index rather than replacing it - confirm.
            try (Directory saveDirectory = FSDirectory.open(Paths.get(INDEX_DIR));
                 Analyzer analyzer = new WhitespaceAnalyzer();
                 IndexWriter indexWriter = new IndexWriter(saveDirectory, new IndexWriterConfig(analyzer))) {
                int categoryLimit = Math.min(MAX_CATEGORIES, categories.length);
                for (int i = 0; i < categoryLimit; i++) {
                    File[] subFiles = categories[i].listFiles();
                    if(subFiles == null){
                        // entry is not a directory
                        continue;
                    }
                    int fileLimit = Math.min(MAX_FILES_PER_CATEGORY, subFiles.length);
                    for (int j = 0; j < fileLimit; j++) {
                        if(!subFiles[j].isFile()){
                            continue;
                        }
                        String content = readContent(subFiles[j]);
                        System.out.println(content);
                        Article article = new Article(fileNum,subFiles[j].getName(),content);
                        indexWriter.addDocument(toDocument(article));
                        System.out.println("have already add file to fileDocment system"+fileNum);
                        fileNum = fileNum+1;
                    }
                }
            }
        }

        /**
         * Reads a whole file into a string, decoding all bytes in one step so that only the
         * bytes actually present in the file end up in the result.
         *
         * @param file the file to read
         * @return the file content
         * @throws IOException if the file cannot be read
         */
        private static String readContent(File file) throws IOException{
            // NOTE(review): decodes with the platform default charset, as the original code
            // did; pin an explicit charset once the corpus encoding is confirmed.
            return new String(Files.readAllBytes(file.toPath()));
        }

        /**
         * Converts an article into a Lucene document with stored {@code id}, {@code title}
         * and {@code content} text fields.
         *
         * @param article the article to convert
         * @return the corresponding document
         */
        private static Document toDocument(Article article){
            Document doc = new Document();
            doc.add(new TextField("id", article.getId().toString(), Store.YES));
            doc.add(new TextField("title", article.getTitle(), Store.YES));
            doc.add(new TextField("content", article.getContent(), Store.YES));
            return doc;
        }

        /**
         * Rebuilds an article from the stored fields of an indexed document.
         *
         * @param doc a document carrying {@code id}, {@code title} and {@code content}
         * @return the corresponding article
         */
        private static Article toArticle(Document doc){
            return new Article(Integer.parseInt(doc.get("id")),doc.get("title"),doc.get("content"));
        }
        
        /**
         * Searches the {@code content} field of the index in {@link #INDEX_DIR} for a fixed
         * term and prints the score, internal document number, id and title of each hit
         * (at most {@link #MAX_HITS}) together with the total hit count.
         *
         * @throws IOException if the index cannot be opened or read
         * @throws ParseException if the query string cannot be parsed
         */
        @Test
        public void testSearch() throws IOException, ParseException{
            // 1. search condition
            String queryCondition = "philosophical";
            
            // 2. run the search
            List<Article> articles = new ArrayList<Article>();
            int count;
            try (Analyzer analyzer = new WhitespaceAnalyzer();
                 Directory directory = FSDirectory.open(Paths.get(INDEX_DIR));
                 IndexReader indexReader = DirectoryReader.open(directory)) {
                // turn the query string into a Query object (searches the "content" field)
                QueryParser queryParser = new QueryParser("content",analyzer);
                Query query = queryParser.parse(queryCondition);
                
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                TopDocs topDocs = indexSearcher.search(query, MAX_HITS);
                
                count = topDocs.totalHits;// total number of matches
                for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                    int docId = scoreDoc.doc;
                    System.out.println("得分是:"+scoreDoc.score+"内部编号是:"+docId);
                    
                    // load the stored document behind the internal number and convert it
                    articles.add(toArticle(indexSearcher.doc(docId)));
                }
            }
            
            // 3. show the result on the console; everything goes to stdout so that the
            // total is not interleaved with the hit lines
            System.out.println("总结果数:"+count);
            for (Article article : articles) {
                System.out.println("查询结果:ID为:"+article.getId()+",title为:"+article.getTitle());
            }
        }
        
        
        /**
         * Builds the network, trains it on the XOR patterns and prints its answers.
         */
        @Test
        public void testNeuralNet(){
            initNeuralNet();
            train();
            interrogate();
        }
        
        /**
         * Creates the 2-3-1 network (linear input layer, sigmoid hidden and output layers),
         * wires the layers with full synapses, attaches the input and teaching synapses and
         * registers this object as listener. Must be called before {@link #train()} and
         * {@link #interrogate()}.
         */
        public void initNeuralNet(){
            // first create the three layers
            input = new LinearLayer();
            hidden = new SigmoidLayer();
            output = new SigmoidLayer();
            
            // set the dimensions of the layers
            input.setRows(2);
            hidden.setRows(3);
            output.setRows(1);
            
            input.setLayerName("L.input");
            hidden.setLayerName("L.hidden");
            output.setLayerName("L.output");
            
            // now create the two synapses
            FullSynapse synapse_IH = new FullSynapse();// input --> hidden connection
            FullSynapse synapse_HO = new FullSynapse();// hidden --> output connection
            
            // connect the input layer with the hidden layer
            input.addOutputSynapse(synapse_IH);
            hidden.addInputSynapse(synapse_IH);
            
            // connect the hidden layer with the output layer
            hidden.addOutputSynapse(synapse_HO);
            output.addInputSynapse(synapse_HO);
            
            // the input to the neural net
            inputSynapse = new MemoryInputSynapse();
            input.addInputSynapse(inputSynapse);
            
            // the trainer and its desired output
            desireOutputSynapse = new MemoryInputSynapse();
            TeachingSynapse trainer = new TeachingSynapse();
            
            trainer.setDesired(desireOutputSynapse);
            
            // now add this structure to a NeuralNet object
            nnet = new NeuralNet();
            
            nnet.addLayer(input,NeuralNet.INPUT_LAYER);
            nnet.addLayer(hidden,NeuralNet.HIDDEN_LAYER);
            nnet.addLayer(output, NeuralNet.OUTPUT_LAYER);
            nnet.setTeacher(trainer);
            output.addOutputSynapse(trainer);
            nnet.addNeuralNetListener(this);
        }
        
        /**
         * Feeds the XOR patterns and their desired outputs to the network and runs 5000
         * training cycles, then prints the elapsed wall-clock time.
         *
         * @throws IllegalStateException if {@link #initNeuralNet()} has not been called
         */
        public void train(){
            requireInitialized();
            // set the inputs
            inputSynapse.setInputArray(inputArray);
            inputSynapse.setAdvancedColumnSelector("1,2");
            // set the desired outputs
            desireOutputSynapse.setInputArray(desiredOutputArray);
            desireOutputSynapse.setAdvancedColumnSelector("1");
            // get the monitor object to train or feed forward
            Monitor monitor = nnet.getMonitor();
            
            // set the monitor parameters
            monitor.setLearningRate(0.8);
            monitor.setMomentum(0.3);
            monitor.setTrainingPatterns(inputArray.length);
            monitor.setTotCicles(5000);
            monitor.setLearning(true);
            
            long initms = System.currentTimeMillis();
            // presumably runs the network in single-thread, synchronized mode - see Joone docs
            monitor.setSingleThreadMode(singleThreadMode);
            nnet.go(true);
            System.out.println("Total time="+(System.currentTimeMillis()-initms)+"ms");
        }
    
        
        /**
         * Runs the network once, with learning switched off, over the four XOR input
         * patterns and prints the first output value produced for each pattern.
         *
         * @throws IllegalStateException if {@link #initNeuralNet()} has not been called
         */
        public void interrogate(){
            // checked up front: the network and its input synapse are used right away
            requireInitialized();
            double[][] queryPatterns = new double[][]{
                    {0.0,1.0},
                    {1.0,0.0},
                    {1.0,1.0},
                    {0.0,0.0}
            };
            // set the inputs
            inputSynapse.setInputArray(queryPatterns);
            inputSynapse.setAdvancedColumnSelector("1,2");
            Monitor monitor = nnet.getMonitor();
            monitor.setTrainingPatterns(queryPatterns.length);
            monitor.setTotCicles(1);
            monitor.setLearning(false);
            
            // output synapse that collects the output of the net in memory
            MemoryOutputSynapse memOut = new MemoryOutputSynapse();
            nnet.addOutputSynapse(memOut);
            System.out.println(nnet.check());
            monitor.setSingleThreadMode(singleThreadMode);
            nnet.go();
            for (int i = 0; i < queryPatterns.length; i++) {
                double[] pattern = memOut.getNextPattern();
                System.out.println("Output pattern #"+(i+1)+"="+pattern[0]);
            }
            System.out.println("Interrogating Finished");
        }

        /**
         * Fails fast when the network objects created by {@link #initNeuralNet()} are missing.
         */
        private void requireInitialized(){
            if(nnet == null || inputSynapse == null || desireOutputSynapse == null){
                throw new IllegalStateException("initNeuralNet() must be called before using the network");
            }
        }
        
        
        /**
         * Listener callback; intentionally does nothing.
         */
        public void cicleTerminated(NeuralNetEvent arg0) {
            
        }
    
        /**
         * Listener callback; prints the monitor's global error whenever its current cycle
         * value is a multiple of 100.
         */
        public void errorChanged(NeuralNetEvent e) {
            Monitor mon=(Monitor) e.getSource();
            if(mon.getCurrentCicle()%100==0){
                // NOTE(review): total minus current is printed as the epoch, which suggests
                // the current-cycle value counts down - confirm against the Joone docs.
                System.out.println("Epoch:"+(mon.getTotCicles()-mon.getCurrentCicle())+"RMSE:"
                        +mon.getGlobalError());
            }
        }
    
        /**
         * Listener callback; prints on one line whether the network started for training
         * or for interrogation.
         */
        public void netStarted(NeuralNetEvent e) {
            Monitor mon = (Monitor) e.getSource();
            // print (not println) so the mode completes the sentence on the same line
            System.out.print("Network started for ");
            if(mon.isLearning()){
                System.out.println("training");
            }else{
                System.out.println("interrogation.");
            }
        }
    
        /**
         * Listener callback; prints the monitor's global error when the network stops.
         */
        public void netStopped(NeuralNetEvent e) {
            Monitor mon = (Monitor) e.getSource();
            System.out.println("Network stopped . Last RMSE="
                    +mon.getGlobalError());
        }
    
        /**
         * Listener callback; prints the error message the network stopped with.
         */
        public void netStoppedError(NeuralNetEvent e, String error) {
            System.out.println("Network stopped due the following error:"
                    +error);
        }
    }

    结果

    得分是:0.25462872内部编号是:7840
    得分是:0.24006625内部编号是:7841
    查询结果:ID为:2,title为:51060总结果数:2
    查询结果:ID为:1,title为:49960
  • 相关阅读:
    test3
    test2
    test
    移动布局之流式布局
    移动端布局
    如何将本地项目上传到Github上
    node: 使用res.send()时报错RangeError [ERR_HTTP_INVALID_STATUS_CODE]: Invalid status code: *
    字体图标
    pip install Wikipedia2Vec时报错:UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc4 in position 41: invalid continuation byte
    tensorflow与keras版本不匹配问题
  • 原文地址:https://www.cnblogs.com/weizhen/p/6163127.html
Copyright © 2011-2022 走看看