zoukankan      html  css  js  c++  java
  • 42、lucene和机器学习进行全文搜索,并排序

    package com.lucene.test;
    
    import java.io.BufferedInputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.joone.engine.FullSynapse;
    import org.joone.engine.LinearLayer;
    import org.joone.engine.Monitor;
    import org.joone.engine.NeuralNetEvent;
    import org.joone.engine.NeuralNetListener;
    import org.joone.engine.SigmoidLayer;
    import org.joone.engine.learning.TeachingSynapse;
    import org.joone.io.MemoryInputSynapse;
    import org.joone.io.MemoryOutputSynapse;
    import org.joone.net.NeuralNet;
    import org.junit.Test;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    import com.lucene.domain.Article;
    
    public class TestLucene implements NeuralNetListener{
        // The network under test; assigned in initNeuralNet().
        private NeuralNet nnet = null;
        // In-memory sources for the network input and for the teacher's desired output;
        // both are created in initNeuralNet() and loaded with data in train().
        private MemoryInputSynapse inputSynapse,desireOutputSynapse;
        // Input layer (2 rows), created in initNeuralNet().
        LinearLayer input;
        // Hidden (3 rows) and output (1 row) layers, created in initNeuralNet().
        SigmoidLayer hidden,output;
        // Passed to Monitor.setSingleThreadMode() before every run.
        boolean singleThreadMode = true;
        
        // Training inputs: all four combinations of two binary values.
        private double[][] inputArray = new double[][]{
                {0.0,0.0},
                {0.0,1.0},
                {1.0,0.0},
                {1.0,1.0}
        };
        
        // Desired output for each row of inputArray.
        // NOTE(review): the original comments labelled these arrays "XOR", but the
        // target for input {1.0,1.0} is 1.0, which makes this the OR truth table;
        // XOR would need 0.0 in the last row. Confirm which was intended.
        private double[][] desiredOutputArray = new double[][]{
                {0.0},
                {1.0},
                {1.0},
                {1.0}
        };
        
        /**
         * Builds the Lucene index in {@code ./indexDir/} from a small sample of the
         * {@code ./20_newsgroups} corpus.
         *
         * <p>Only the first sub-directory (in name order) is read, and at most ten of its
         * files. Each file becomes one document with the stored text fields {@code id}
         * (a running number starting at 1), {@code title} (the file name) and
         * {@code content} (the file text). If the corpus directory does not exist the
         * method returns without touching the index.
         *
         * <p>NOTE(review): the writer is opened with the default {@code IndexWriterConfig},
         * so repeated runs presumably append duplicates to an existing index - confirm
         * against the Lucene version in use.
         *
         * @throws Exception if a corpus file cannot be read or the index cannot be written
         */
        @Test
        public void testCreateIndex() throws Exception{
            // Sampling limits of this experiment: first group only, ten posts from it.
            final int maxGroups = 1;
            final int maxFilesPerGroup = 10;

            File[] groups = new File("./20_newsgroups").listFiles();
            if (groups == null) {
                // listFiles() yields null when the path is missing or not a directory.
                return;
            }
            // listFiles() guarantees no ordering; sort so the sample and the ids are stable.
            Arrays.sort(groups);

            int fileNum = 1;
            // One writer for the whole run; try-with-resources closes (and thereby commits)
            // it even when reading or indexing a file fails half-way through.
            try (Directory saveDirectory = FSDirectory.open(Paths.get("./indexDir/"));
                    Analyzer analyzer = new WhitespaceAnalyzer();
                    IndexWriter indexWriter = new IndexWriter(saveDirectory, new IndexWriterConfig(analyzer))) {
                int groupCount = Math.min(maxGroups, groups.length);
                for (int i = 0; i < groupCount; i++) {
                    File[] posts = groups[i].listFiles();
                    if (posts == null) {
                        // Plain file at the top level of the corpus: nothing to descend into.
                        continue;
                    }
                    Arrays.sort(posts);

                    int postCount = Math.min(maxFilesPerGroup, posts.length);
                    for (int j = 0; j < postCount; j++) {
                        // Read the whole file and decode it once, so only the bytes actually
                        // read end up in the text (the old chunked loop appended the full
                        // buffer on every read, including stale bytes).
                        // NOTE(review): decodes with the platform default charset, as before.
                        String content = new String(Files.readAllBytes(posts[j].toPath()));
                        System.out.println(content);

                        Article article = new Article(fileNum, posts[j].getName(), content);

                        // Convert the Article into a Lucene Document.
                        Document doc = new Document();
                        doc.add(new TextField("id", article.getId().toString(), Store.YES));
                        doc.add(new TextField("title", article.getTitle(), Store.YES));
                        doc.add(new TextField("content", article.getContent(), Store.YES));

                        indexWriter.addDocument(doc);
                        System.out.println("have already add file to fileDocment system"+fileNum);
                        fileNum++;
                    }
                }
            }
        }
        
        /**
         * Searches the {@code content} field of the index in {@code ./indexDir/} for a
         * fixed term and prints the hits.
         *
         * <p>For every hit (at most 100) the score and Lucene's internal document number
         * are printed; afterwards the total hit count and the id/title of each hit.
         *
         * @throws IOException if the index cannot be opened or read
         * @throws ParseException if the query string cannot be parsed
         */
        @Test
        public void testSearch() throws IOException, ParseException{
            // 1. Search condition.
            String queryCondition = "philosophical";

            // 2. Run the search; hits are collected as Article objects.
            List<Article> articles = new ArrayList<Article>();

            // try-with-resources releases the reader and the directory even when
            // parsing or searching throws.
            try (Directory directory = FSDirectory.open(Paths.get("./indexDir/"));
                    Analyzer analyzer = new WhitespaceAnalyzer();
                    IndexReader indexReader = DirectoryReader.open(directory)) {

                // Turn the query string into a Query that targets the "content" field.
                QueryParser queryParser = new QueryParser("content", analyzer);
                Query query = queryParser.parse(queryCondition);

                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                TopDocs topDocs = indexSearcher.search(query, 100);
                int count = topDocs.totalHits; // total number of matches

                for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                    int docId = scoreDoc.doc;
                    System.out.println("得分是:"+scoreDoc.score+"内部编号是:"+docId);

                    // Load the stored document by its internal number and map it back
                    // to an Article.
                    Document doc = indexSearcher.doc(docId);
                    articles.add(new Article(Integer.parseInt(doc.get("id")),
                            doc.get("title"), doc.get("content")));
                }

                // 3. Show the results. Everything goes to stdout, one item per line, so
                // the total no longer interleaves with the hit list.
                System.out.println("总结果数:"+count);
                for (Article article : articles) {
                    System.out.println("查询结果:ID为:"+article.getId()+",title为:"+article.getTitle());
                }
            }
        }
        
        
        /**
         * Runs the neural-network demo end to end on a fresh instance:
         * build the network, train it, then query it.
         */
        @Test
        public void testNeuralNet(){
            TestLucene demo = new TestLucene();
            demo.initNeuralNet(); // wire up layers, synapses and teacher
            demo.train();         // fit the network to the training arrays
            demo.interrogate();   // print the outputs for the four query patterns
        }
        
        /**
         * Builds the 2-3-1 feed-forward network and stores its parts in the fields
         * {@code input}, {@code hidden}, {@code output}, {@code inputSynapse},
         * {@code desireOutputSynapse} and {@code nnet}. This instance is registered as
         * the network's listener.
         */
        public void initNeuralNet(){
            // Input layer: linear, two rows.
            input = new LinearLayer();
            input.setRows(2);
            input.setLayerName("L.input");

            // Hidden layer: sigmoid, three rows.
            hidden = new SigmoidLayer();
            hidden.setRows(3);
            hidden.setLayerName("L.hidden");

            // Output layer: sigmoid, one row.
            output = new SigmoidLayer();
            output.setRows(1);
            output.setLayerName("L.output");

            // Full connection input -> hidden.
            FullSynapse inputToHidden = new FullSynapse();
            input.addOutputSynapse(inputToHidden);
            hidden.addInputSynapse(inputToHidden);

            // Full connection hidden -> output.
            FullSynapse hiddenToOutput = new FullSynapse();
            hidden.addOutputSynapse(hiddenToOutput);
            output.addInputSynapse(hiddenToOutput);

            // In-memory source that feeds the input layer.
            inputSynapse = new MemoryInputSynapse();
            input.addInputSynapse(inputSynapse);

            // The teacher reads the desired output from a second in-memory source.
            desireOutputSynapse = new MemoryInputSynapse();
            TeachingSynapse teacher = new TeachingSynapse();
            teacher.setDesired(desireOutputSynapse);

            // Assemble everything into a NeuralNet object.
            nnet = new NeuralNet();
            nnet.addLayer(input, NeuralNet.INPUT_LAYER);
            nnet.addLayer(hidden, NeuralNet.HIDDEN_LAYER);
            nnet.addLayer(output, NeuralNet.OUTPUT_LAYER);
            nnet.setTeacher(teacher);
            output.addOutputSynapse(teacher);
            nnet.addNeuralNetListener(this);
        }
        
        /**
         * Trains the network on {@code inputArray} / {@code desiredOutputArray} for
         * 5000 cycles and prints the elapsed wall-clock time.
         * Expects {@link #initNeuralNet()} to have been called first.
         */
        public void train(){
            // Training inputs: both columns of inputArray.
            inputSynapse.setInputArray(inputArray);
            inputSynapse.setAdvancedColumnSelector("1,2");

            // Desired outputs: the single column of desiredOutputArray.
            desireOutputSynapse.setInputArray(desiredOutputArray);
            desireOutputSynapse.setAdvancedColumnSelector("1");

            // The monitor carries the training parameters.
            Monitor trainingMonitor = nnet.getMonitor();
            trainingMonitor.setLearningRate(0.8);
            trainingMonitor.setMomentum(0.3);
            trainingMonitor.setTrainingPatterns(inputArray.length);
            trainingMonitor.setTotCicles(5000);
            trainingMonitor.setLearning(true);

            final long startedAt = System.currentTimeMillis();
            // Run the network in single-thread, synchronized mode.
            nnet.getMonitor().setSingleThreadMode(singleThreadMode);
            nnet.go(true);
            final long elapsedMs = System.currentTimeMillis() - startedAt;
            System.out.println("Total time="+elapsedMs+"ms");
        }
    
        
        /**
         * Feeds four query patterns through the network with learning switched off and
         * prints the first output value produced for each pattern.
         *
         * @throws IllegalStateException if {@link #initNeuralNet()} has not been called,
         *         i.e. the network or its input synapse does not exist yet
         */
        public void interrogate(){
            // Check up front: the previous null check sat after nnet had already been
            // dereferenced, so it could never prevent the NullPointerException.
            if (nnet == null || inputSynapse == null) {
                throw new IllegalStateException("initNeuralNet() must be called before interrogate()");
            }

            // Named differently from the inputArray field to avoid shadowing it.
            double[][] queryPatterns = new double[][]{
                    {0.0,1.0},
                    {1.0,0.0},
                    {1.0,1.0},
                    {0.0,0.0}
            };

            // Set the inputs.
            inputSynapse.setInputArray(queryPatterns);
            inputSynapse.setAdvancedColumnSelector("1,2");

            // A single pass over the patterns, no learning.
            Monitor monitor = nnet.getMonitor();
            monitor.setTrainingPatterns(queryPatterns.length);
            monitor.setTotCicles(1);
            monitor.setLearning(false);

            // Collect the output of the net in memory.
            MemoryOutputSynapse memOut = new MemoryOutputSynapse();
            nnet.addOutputSynapse(memOut);
            System.out.println(nnet.check());
            monitor.setSingleThreadMode(singleThreadMode);
            nnet.go();

            // One output pattern is read back per query pattern.
            for (int i = 0; i < queryPatterns.length; i++) {
                double[] pattern = memOut.getNextPattern();
                System.out.println("Output pattern #"+(i+1)+"="+pattern[0]);
            }
            System.out.println("Interrogating Finished");
        }
        
        
        /**
         * Listener callback that this class deliberately ignores.
         *
         * @param arg0 the event passed in by the network (unused)
         */
        public void cicleTerminated(NeuralNetEvent arg0) {
            // Intentionally empty: progress is printed from errorChanged instead.
        }
    
        /**
         * Prints a progress line whenever the monitor's current cycle value is a
         * multiple of 100; all other notifications are ignored.
         *
         * @param e event whose source is the network's {@code Monitor}
         */
        public void errorChanged(NeuralNetEvent e) {
            Monitor monitor = (Monitor) e.getSource();
            if (monitor.getCurrentCicle() % 100 != 0) {
                return;
            }
            // Total cycles minus the current cycle value, followed by the global error.
            StringBuilder line = new StringBuilder("Epoch:");
            line.append(monitor.getTotCicles() - monitor.getCurrentCicle());
            line.append("RMSE:").append(monitor.getGlobalError());
            System.out.println(line);
        }
    
        /**
         * Prints whether the network was started for training or for interrogation,
         * depending on the monitor's learning flag.
         *
         * @param e event whose source is the network's {@code Monitor}
         */
        public void netStarted(NeuralNetEvent e) {
            Monitor mon = (Monitor) e.getSource();
            String phase = mon.isLearning() ? "training" : "interrogation.";
            // One println for the whole sentence: the prefix ends in a space and was
            // previously printed with its own line break, splitting the message in two.
            System.out.println("Network started for " + phase);
        }
    
        /**
         * Prints the monitor's global error when the network stops.
         *
         * @param e event whose source is the network's {@code Monitor}
         */
        public void netStopped(NeuralNetEvent e) {
            Monitor source = (Monitor) e.getSource();
            String report = "Network stopped . Last RMSE=" + source.getGlobalError();
            System.out.println(report);
        }
    
        /**
         * Prints the error text reported when the network stops abnormally.
         *
         * @param e the event passed in by the network (unused)
         * @param error description of the failure
         */
        public void netStoppedError(NeuralNetEvent e, String error) {
            String report = "Network stopped due the following error:" + error;
            System.out.println(report);
        }
    }

    testSearch 的运行结果(控制台输出):

    得分是:0.25462872内部编号是:7840
    得分是:0.24006625内部编号是:7841
    查询结果:ID为:2,title为:51060总结果数:2
    查询结果:ID为:1,title为:49960
  • 相关阅读:
    Springboot 之 自定义配置文件及读取配置文件
    SQLSERVER系统视图 sql server系统表详细说明
    MySQL Workbench建表时 PK NN UQ BIN UN ZF AI 的含义
    使用Ecplise git commit时出现"There are no stages files"
    maven添加sqlserver的jdbc驱动包
    java将XML文档转换成json格式数据
    java将XML文档转换成json格式数据
    cannot be resolved. It is indirectly referenced from required .class files
    org.codehaus.jackson.map.JsonMappingException: Can not construct instance of java.util.Date from String value '2012-12-12 12:01:01': not a valid representation (error: Can not parse date "2012-12-
    @Autowired注解和静态方法 NoClassDefFoundError could not initialize class 静态类
  • 原文地址:https://www.cnblogs.com/weizhen/p/6163127.html
Copyright © 2011-2022 走看看