zoukankan      html  css  js  c++  java
  • 文本倾向性分析

    package test;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Set;
    import java.util.Vector;
    
    
    
    /**
     * Lexicon-based sentiment ("opinion") analyser.
     *
     * <p>Three word lists are loaded from the database by {@link #OpinionAnalyser()}.
     * A piece of text is scored by looking for sentiment words, optionally preceded
     * directly by a modifier word, and the per-sentence results are accumulated into
     * {@link #posCount} and {@link #negCount}. {@link #getPolar()} turns the two
     * counters into a final -1 / 0 / +1 verdict.
     *
     * <p>Typical use:
     * <pre>
     *   OpinionAnalyser a = new OpinionAnalyser();
     *   a.OpinionAnalyser();               // load the word lists
     *   a.analyse(1, keywords, text, title);
     *   int polarity = a.getPolar();
     * </pre>
     *
     * <p>The counters are plain mutable fields that every analysis call resets, so one
     * instance should not be shared between concurrent analyses.
     */
    public class OpinionAnalyser {
        // Sentiment (opinion) word list, filled from table twordlist.
        public Vector <Word> words=new Vector <Word>();
        // Modifier word list, filled from table twordlist_xs.
        public Vector <Word> adjectives=new Vector <Word>();
        // Description word list, filled from table twordlist_ms (loaded but not used by the scoring code in this class).
        public Vector <Word> descriptions=new Vector <Word>();
        // Accumulated positive votes of the last analysis.
        public int posCount;
        // Accumulated negative votes of the last analysis.
        public int negCount;
        
        // Database connection settings handed to ConnDB.
        static String SERVER="59.77.233.*";
        static String USER="";
        static String PASSWORD="";
        static String DATABASE="skycent";
        
        // Multiplier applied to negative results; 2 means a negative hit counts twice as much as a positive one.
        static int NEG_WEIGHT=2;
        // Votes added per keyword found in a title / short text whose score is non-zero.
        static int TITLE_WEIGHT=10;
        
        /** Parses a decimal integer string; throws NumberFormatException on null or malformed input. */
        private static int atoi(String s)
        {
            return Integer.parseInt(s);
        }
        
        /**
         * Loads the three word lists from the database and resets both counters.
         *
         * <p>NOTE: despite its name this is an ordinary method (it has a {@code void}
         * return type), not a constructor. It must be called explicitly after
         * {@code new OpinionAnalyser()}. Loaded words are appended to the existing
         * lists.
         *
         * @throws SQLException if connecting, querying or closing fails
         */
        public void OpinionAnalyser() throws SQLException
        {
            ConnDB conndb = new ConnDB(SERVER, USER, PASSWORD, DATABASE);
            try {
                conndb.executeUpdate("SET NAMES 'utf8mb4'");
                
                // Sentiment word list.
                loadWords(conndb, "select word,polar,weight from twordlist", "polar", "weight", "twordlist", words);
                // Description word list; it has no weight column, so the weight is stored as 0.
                loadWords(conndb, "select word,type from twordlist_ms", "type", null, "twordlist_ms", descriptions);
                // Modifier word list.
                loadWords(conndb, "select word,polar,weight from twordlist_xs", "polar", "weight", "twordlist_xs", adjectives);
                
                posCount=0;
                negCount=0;
            } finally {
                // Release the connection even when one of the queries fails.
                conndb.close();
            }
        }
        
        /**
         * Runs one query and appends a Word for every returned row to {@code target}.
         * Prints "&lt;tableName&gt;没有词!" when the query returns no rows.
         *
         * @param conndb       open database wrapper
         * @param sql          select statement; it must return a "word" column
         * @param polarColumn  column holding the integer polarity / type
         * @param weightColumn column holding the integer weight, or null to use 0
         * @param tableName    table name used in the "no words" message
         * @param target       list that receives the loaded words
         * @throws SQLException if the query fails
         */
        private static void loadWords(ConnDB conndb, String sql, String polarColumn,
                String weightColumn, String tableName, Vector <Word> target) throws SQLException
        {
            PreparedStatement stmt = null;
            ResultSet rs = null;
            try {
                stmt = conndb.getConnection().prepareStatement(sql);
                rs = stmt.executeQuery();
                
                // Detect the empty case with a flag: the result set is forward-only,
                // so peeking with next() and stepping back with previous() is not portable.
                boolean empty = true;
                while(rs.next())
                {
                    empty = false;
                    int polar=atoi(rs.getString(polarColumn));
                    int weight=(weightColumn==null) ? 0 : atoi(rs.getString(weightColumn));
                    target.addElement(new Word(rs.getString("word"),polar,weight));
                }
                if(empty)
                {
                    System.out.println(tableName+"没有词!");
                }
            } finally {
                try {
                    if(rs!=null)
                        rs.close();
                } finally {
                    if(stmt!=null)
                        stmt.close();
                }
            }
        }
        
        /**
         * Computes the sentiment score of one sentence.
         *
         * <p>For every sentiment word contained in the sentence: if one or more
         * "modifier + word" pairs also occur, each pair contributes
         * word.weight * modifier.weight * word.polar * modifier.polar; otherwise the
         * word alone contributes weight * polar (only when polar is 1 or -1).
         * Non-positive pair scores and negative single words are multiplied by
         * NEG_WEIGHT.
         *
         * @param sentence text to score; null is scored as 0
         * @return the summed score (positive, negative or 0)
         */
        public int sentenceScore(String sentence)
        {
            if(sentence==null)
                return 0;
            
            int opinionScore=0;
            for(Word word : words)
            {
                // Skip sentiment words that do not occur at all.
                if(sentence.indexOf(word.getWord())==-1)
                    continue;
                
                // Did at least one "modifier + sentiment word" pair occur?
                boolean pairFound=false;
                for(Word adjective : adjectives)
                {
                    String pair=new StringBuilder()
                            .append(adjective.getWord())
                            .append(word.getWord())
                            .toString();
                    if(sentence.indexOf(pair)==-1)
                        continue;
                    
                    pairFound=true;
                    int tmpScore=word.getWeight()*adjective.getWeight()*word.getPolar()*adjective.getPolar();
                    if(tmpScore>0)
                        opinionScore +=tmpScore;
                    else
                        opinionScore +=tmpScore*NEG_WEIGHT;
                }
                
                // No modifier in front of the word: use the word's own weight.
                if(!pairFound)
                {
                    if(word.getPolar()==1)
                        opinionScore+=word.getWeight()*word.getPolar();
                    else if(word.getPolar()==-1)
                        opinionScore+=word.getWeight()*word.getPolar()*NEG_WEIGHT;
                }
            }
            return opinionScore;
        }
        
        /**
         * Analyses an ordinary news item (title plus body).
         *
         * <p>The title is voted on first via {@link #shortTextOpinion}. The body is
         * then split on single spaces into a set of distinct sentences, and for every
         * keyword each sentence containing it adds 1 to posCount (score &gt; 0) or
         * NEG_WEIGHT to negCount (score &lt; 0).
         *
         * @param keyword keywords that make a sentence relevant; null means none
         * @param text    body text; null means no body
         * @param title   title text; null means no title
         */
        public void opinion(Set<String> keyword,String text,String title)
        {
            posCount=0;
            negCount=0;
            System.out.println("opinion");
            // Title vote; this call also resets both counters itself.
            shortTextOpinion(keyword,title);
            
            if(keyword==null || text==null)
                return;
            
            Set<String> sentences = new HashSet<String>();
            for(String part : text.split(" "))
            {
                sentences.add(part);
            }
            
            // Walk the sentences afresh for every keyword. A single shared iterator
            // would be exhausted after the first keyword and silently skip the rest.
            for(String kword : keyword)
            {
                if(kword==null)
                    continue;
                for(String sentence : sentences)
                {
                    if(sentence.indexOf(kword)==-1)
                        continue;
                    
                    // Score of this single sentence.
                    int value=sentenceScore(sentence);
                    if(value>0)
                        posCount++;
                    else if(value<0)
                        negCount +=NEG_WEIGHT;
                }
            }
        }
        
        /**
         * Analyses a short text such as a microblog post or a title.
         *
         * <p>Resets both counters, scores the whole text once with
         * {@link #sentenceScore}, and then for every keyword contained in the text adds
         * TITLE_WEIGHT to posCount (score &gt; 0) or TITLE_WEIGHT * NEG_WEIGHT to
         * negCount (score &lt; 0).
         *
         * @param keyword keywords that make the text relevant; null means none
         * @param text    text to analyse; null leaves both counters at 0
         */
        public void shortTextOpinion(Set<String> keyword,String text)
        {
            System.out.println("shortTextOpinion");
            posCount=0;
            negCount=0;
            
            if(keyword==null || text==null)
                return;
            
            // The score depends only on the text, not on the keyword, so compute it once.
            int opinionScore=sentenceScore(text);
            
            for(String kword : keyword)
            {
                // Only keywords that actually occur in the text cast a vote.
                if(kword==null || text.indexOf(kword)==-1)
                    continue;
                
                if(opinionScore>0)
                    posCount +=TITLE_WEIGHT;
                else if(opinionScore<0)
                    negCount +=TITLE_WEIGHT*NEG_WEIGHT;
            }
        }
        
        /**
         * Entry point that picks the analysis by media type.
         *
         * @param media   3 selects the short-text analysis (microblog); any other value
         *                selects the ordinary news analysis
         * @param keyword keywords that make a sentence relevant
         * @param text    body text; ignored when media is 3
         * @param title   title text; for media 3 this is the text that is analysed
         */
        public void analyse(int media,Set<String> keyword,String text,String title)
        {
            if(media ==3)
            {
                System.out.println("media=3");
                shortTextOpinion(keyword,title);
            }
            else
            {
                System.out.println("media=1");
                opinion(keyword,text,title);
            }
        }
        
        /**
         * Returns the final polarity of the last analysis.
         *
         * @return 1 if posCount is larger, -1 if negCount is larger, 0 if they are equal
         */
        public int getPolar()
        {
            if(posCount>negCount)
                return 1;
            else if(negCount>posCount)
                return -1;
            else
                return 0;
        }
        
        /** Small manual smoke test: loads the word lists and prints the score of a sample sentence. */
        public static void main(String[] args) throws SQLException
        {
            OpinionAnalyser a=new OpinionAnalyser();
            a.OpinionAnalyser();
            a.sentenceScore("好不好!");
            String str="心情很好";
            System.out.println("文本倾向性:"+a.sentenceScore(str));
        }
    
        
    }
  • 相关阅读:
    命令[34]
    命令[33]
    命令[27]
    命令[38]
    命令[19]
    命令[22]
    命令[30]
    命令[37]
    命令[23]
    命令[26]
  • 原文地址:https://www.cnblogs.com/zeze/p/5331650.html
Copyright © 2011-2022 走看看