zoukankan      html  css  js  c++  java
  • 文本倾向性分析

    package test;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Set;
    import java.util.Vector;
    
    
    
    /**
     * Lexicon-based opinion (sentiment polarity) analyser.
     *
     * <p>Three word lists are loaded from the database by {@link #OpinionAnalyser()}:
     * opinion words, modifier words that may directly precede an opinion word, and
     * description words. Texts are scored by substring matching against these lists;
     * the result is accumulated in {@link #posCount} / {@link #negCount} and
     * summarised by {@link #getPolar()}.
     *
     * <p>Note: {@code OpinionAnalyser()} is a regular method, not a constructor (it
     * declares a {@code void} return type). Callers must invoke it explicitly after
     * {@code new OpinionAnalyser()}; until then the word lists are empty.
     */
    public class OpinionAnalyser {
        /** Opinion word list (table {@code twordlist}). */
        public Vector<Word> words = new Vector<Word>();
        /** Modifier word list (table {@code twordlist_xs}). */
        public Vector<Word> adjectives = new Vector<Word>();
        /** Description word list (table {@code twordlist_ms}); loaded but not used for scoring in this class. */
        public Vector<Word> descriptions = new Vector<Word>();
        /** Accumulated positive evidence for the most recently analysed text. */
        public int posCount;
        /** Accumulated negative evidence for the most recently analysed text. */
        public int negCount;

        // Database connection settings passed to ConnDB.
        static String SERVER = "59.77.233.*";
        static String USER = "";
        static String PASSWORD = "";
        static String DATABASE = "skycent";

        /** Weight of negative evidence relative to positive evidence (2 = counts twice as much). */
        static int NEG_WEIGHT = 2;
        /** Amount added to the counters when a title / short text is positive or negative. */
        static int TITLE_WEIGHT = 10;

        /** Parses a decimal integer; thin wrapper around {@link Integer#parseInt(String)}. */
        private static int atoi(String s)
        {
            return Integer.parseInt(s);
        }

        /**
         * Loads the three word lists from the database and resets the counters.
         *
         * <p>Rows are appended to the existing lists, so calling this method more
         * than once duplicates the entries.
         *
         * @throws SQLException if a query fails. Previously such a failure was only
         *         printed and then surfaced as a {@code NullPointerException}.
         */
        public void OpinionAnalyser() throws SQLException
        {
            ConnDB conndb = new ConnDB(SERVER, USER, PASSWORD, DATABASE);
            try {
                conndb.executeUpdate("SET NAMES 'utf8mb4'");

                // Opinion words: polarity and weight both come from the table.
                loadWordTable(conndb, "twordlist",
                        "select word,polar,weight from twordlist", "polar", "weight", words);
                // Description words: the "type" column is stored as the polarity, weight is fixed to 0.
                loadWordTable(conndb, "twordlist_ms",
                        "select word,type from twordlist_ms", "type", null, descriptions);
                // Modifier words.
                loadWordTable(conndb, "twordlist_xs",
                        "select word,polar,weight from twordlist_xs", "polar", "weight", adjectives);

                posCount = 0;
                negCount = 0;
            } finally {
                // Release the connection even when a query fails.
                conndb.close();
            }
        }

        /**
         * Runs one word-list query and appends every row to {@code target}.
         *
         * @param conndb       open database wrapper
         * @param tableName    table name, used only in the "table is empty" message
         * @param strSQL       query to execute; must select a {@code word} column
         * @param polarColumn  column whose integer value becomes the word's polarity
         * @param weightColumn column whose integer value becomes the word's weight,
         *                     or {@code null} to use weight 0
         * @param target       list receiving the loaded words
         * @throws SQLException if the query or row access fails
         */
        private void loadWordTable(ConnDB conndb, String tableName, String strSQL,
                String polarColumn, String weightColumn, Vector<Word> target) throws SQLException
        {
            PreparedStatement stmt = null;
            ResultSet rs = null;
            try {
                stmt = conndb.getConnection().prepareStatement(strSQL);
                rs = stmt.executeQuery();

                // Track emptiness while reading instead of next()/previous():
                // previous() is not allowed on the default forward-only result set.
                boolean empty = true;
                while (rs.next())
                {
                    empty = false;
                    int polar = atoi(rs.getString(polarColumn));
                    int weight = (weightColumn == null) ? 0 : atoi(rs.getString(weightColumn));
                    target.addElement(new Word(rs.getString("word"), polar, weight));
                }
                if (empty) {
                    System.out.println(tableName + "没有词!");
                }
            } finally {
                try {
                    if (rs != null) {
                        rs.close();
                    }
                } finally {
                    if (stmt != null) {
                        stmt.close();
                    }
                }
            }
        }

        /**
         * Computes the opinion score of one sentence.
         *
         * <p>Every opinion word that occurs in the sentence contributes once; see
         * {@link #wordScore(String, Word)} for the per-word rule.
         *
         * @param sentence text to score; {@code null} is scored as 0
         * @return the sum of all contributions; positive means positive opinion,
         *         negative means negative opinion, 0 means neutral or no match
         */
        public int sentenceScore(String sentence)
        {
            if (sentence == null) {
                return 0;
            }
            int opinionScore = 0;
            for (Word word : words)
            {
                if (sentence.indexOf(word.getWord()) != -1)
                {
                    opinionScore += wordScore(sentence, word);
                }
            }
            return opinionScore;
        }

        /**
         * Scores a single opinion word that is known to occur in {@code text}.
         *
         * <p>If at least one "modifier + opinion word" pair occurs in the text, each
         * such pair contributes the product of both weights and both polarities, and
         * non-positive products are multiplied by {@link #NEG_WEIGHT}. Otherwise the
         * word contributes its own weight: {@code +weight} for polarity 1,
         * {@code -weight * NEG_WEIGHT} for polarity -1, and nothing for any other polarity.
         */
        private int wordScore(String text, Word word)
        {
            String opinionWord = word.getWord();
            int score = 0;
            boolean pairFound = false;

            for (Word adjective : adjectives)
            {
                // Build the pair from scratch for every modifier.
                String wordPair = adjective.getWord() + opinionWord;
                if (text.indexOf(wordPair) != -1)
                {
                    pairFound = true;
                    int tmpScore = word.getWeight() * adjective.getWeight()
                            * word.getPolar() * adjective.getPolar();
                    if (tmpScore > 0)
                        score += tmpScore;
                    else
                        score += tmpScore * NEG_WEIGHT;
                }
            }

            // No modifier in front of the word: use the word's own weight.
            if (!pairFound)
            {
                if (word.getPolar() == 1)
                    score += word.getWeight() * word.getPolar();
                else if (word.getPolar() == -1)
                    score += word.getWeight() * word.getPolar() * NEG_WEIGHT;
            }
            return score;
        }

        /**
         * Analyses a regular news article.
         *
         * <p>The title is scored first via {@link #shortTextOpinion(Set, String)}
         * (which resets the counters). The body is then split on single spaces into
         * a set of distinct sentences, and for every (keyword, sentence) pair where
         * the sentence contains the keyword, a positive sentence adds 1 to
         * {@link #posCount} and a negative one adds {@link #NEG_WEIGHT} to
         * {@link #negCount}. A sentence containing several keywords is therefore
         * counted once per keyword.
         *
         * @param keyword keywords of interest
         * @param text    article body; {@code null} is treated as an empty body
         * @param title   article title
         */
        public void opinion(Set<String> keyword, String text, String title)
        {
            posCount = 0;
            negCount = 0;
            System.out.println("opinion");
            // Title polarity (also resets the counters before counting).
            shortTextOpinion(keyword, title);

            if (text == null) {
                return;
            }

            Set<String> sentences = new HashSet<String>();
            for (String part : text.split(" "))
            {
                sentences.add(part);
            }

            // Walk all sentences for EVERY keyword. A shared iterator would be
            // exhausted after the first keyword, so later keywords would be ignored.
            for (String kword : keyword)
            {
                for (String sentence : sentences)
                {
                    if (sentence.indexOf(kword) != -1)
                    {
                        int value = sentenceScore(sentence);
                        if (value > 0)
                            posCount++;
                        else if (value < 0)
                            negCount += NEG_WEIGHT;
                    }
                }
            }
        }

        /**
         * Analyses a short text such as a microblog post or a title.
         *
         * <p>The counters are reset. The text's score is the same as
         * {@link #sentenceScore(String)}. For every keyword contained in the text,
         * a positive score adds {@link #TITLE_WEIGHT} to {@link #posCount} and a
         * negative score adds {@code TITLE_WEIGHT * NEG_WEIGHT} to {@link #negCount}.
         *
         * @param keyword keywords of interest
         * @param text    text to analyse; {@code null} leaves both counters at 0
         */
        public void shortTextOpinion(Set<String> keyword, String text)
        {
            System.out.println("shortTextOpinion");
            posCount = 0;
            negCount = 0;

            if (text == null) {
                return;
            }

            // The score does not depend on the keyword, so compute it at most once,
            // and only if some keyword actually occurs in the text.
            int opinionScore = 0;
            boolean scored = false;

            for (String kword : keyword)
            {
                if (text.indexOf(kword) == -1) {
                    continue;
                }
                if (!scored) {
                    opinionScore = sentenceScore(text);
                    scored = true;
                }
                if (opinionScore > 0)
                    posCount += TITLE_WEIGHT;
                else if (opinionScore < 0)
                    negCount += TITLE_WEIGHT * NEG_WEIGHT;
            }
        }

        /**
         * Entry point: dispatches on the media type.
         *
         * <p>{@code media == 3} (microblog) scores {@code title} as a short text and
         * ignores {@code text}; any other value runs the full article analysis.
         *
         * @param media   media type code
         * @param keyword keywords of interest
         * @param text    article body (unused when {@code media == 3})
         * @param title   title, or the post itself when {@code media == 3}
         */
        public void analyse(int media, Set<String> keyword, String text, String title)
        {
            if (media == 3)
            {
                System.out.println("media=3");
                shortTextOpinion(keyword, title);
            }
            else
            {
                System.out.println("media=1");
                opinion(keyword, text, title);
            }
        }

        /**
         * Returns the overall polarity of the most recently analysed text.
         *
         * @return 1 if positive evidence dominates, -1 if negative evidence
         *         dominates, 0 if they are equal
         */
        public int getPolar()
        {
            if (posCount > negCount)
                return 1;
            else if (negCount > posCount)
                return -1;
            else
                return 0;
        }

        /**
         * Small manual demo: loads the word lists and prints the score of a sample
         * sentence. To try the full pipeline, build a keyword set, call
         * {@code analyse(1, keyword, text, title)} and print {@code getPolar()}.
         */
        public static void main(String[] args) throws SQLException
        {
            OpinionAnalyser a = new OpinionAnalyser();
            a.OpinionAnalyser();
            a.sentenceScore("好不好!");
            String str = "心情很好";
            System.out.println("文本倾向性:" + a.sentenceScore(str));
        }

    }
  • 相关阅读:
    笔记35 跨重定向请求传递数
    判断邮箱的正则表达式
    按钮
    async await 的用法
    笔记34 Spring MVC的高级技术——处理multipart形式的数据
    Convert Sorted Array to Binary Search Tree
    Binary Tree Zigzag Level Order Traversal
    Unique Binary Search Trees,Unique Binary Search Trees II
    Validate Binary Search Tree
    Populating Next Right Pointers in Each Node,Populating Next Right Pointers in Each Node II
  • 原文地址:https://www.cnblogs.com/zeze/p/5331650.html
Copyright © 2011-2022 走看看