zoukankan      html  css  js  c++  java
  • java大作业 KShinglingAlgorithm

    wiki上关于KShingling Algorithm(w-shingling)的说明:

    http://en.wikipedia.org/wiki/W-shingling

    摘要:

    In natural language processing a w-shingling is a set of unique "shingles"—contiguous subsequences of tokens in a document—that can be used to gauge the similarity of two documents. The w denotes the number of tokens in each shingle in the set.

    The document, "a rose is a rose is a rose" can be tokenized as follows:

    (a,rose,is,a,rose,is,a,rose)

    The set of all contiguous sequences of 4 tokens (N-grams, here: 4-grams) is

    { (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is), (a,rose,is,a), (rose,is,a,rose) } = { (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is) }

    我理解的此算法,是把每段文本都像上述那样分解成shingle集合后,统计两个集合的并集大小b,再统计交集大小a,用a/b(即Jaccard相似系数)得到相似度。

    写得有些复杂:

      1 package bigproject2;
      2 
      import java.util.Arrays;
      import java.util.HashSet;
      import java.util.LinkedHashSet;
      import java.util.Set;

      import javax.swing.JOptionPane;
      4 
      /**
       * Helpers for the w-shingling similarity measure: tokenizing a text,
       * building its set of k-word shingles, and counting the union and
       * intersection of two shingle sets.
       */
      public class union {
          /** Placed between the words of one shingle so that word boundaries stay visible. */
          private static final String TOKEN_SEPARATOR=" ";

          /**
           * Splits a text into its words.
           *
           * <p>Words are separated by any run of whitespace (spaces, tabs, line
           * breaks). Leading and trailing whitespace is ignored, so no empty
           * words are produced.
           *
           * @param str the text to split; {@code null} is treated like an empty text
           * @return the words in order of appearance; an empty array when the
           *         text contains no words
           */
          public String[] ziji(String str)
          {
              if(str==null)
                  return new String[0];
              String trimmed=str.trim();
              if(trimmed.length()==0)
                  return new String[0];
              return trimmed.split("\\s+");
          }

          /**
           * Builds the set of distinct shingles of {@code k} consecutive words.
           *
           * <p>The words of a shingle are joined with a single space. Joining
           * them without a separator would make different word sequences
           * collide, e.g. ("ab","c") and ("a","bc").
           *
           * @param str the words of the text, in order
           * @param k   the number of words per shingle; must be at least 1
           * @return the distinct shingles, in order of first appearance
           * @throws MyException if there are fewer than {@code k} words
           * @throws IllegalArgumentException if {@code k} is less than 1
           */
          public String[] cut(String[] str,int k) throws MyException{
              if(k<1)
                  throw new IllegalArgumentException("k must be at least 1, got "+k);
              if(str.length<k)
                  throw new MyException("单词数少于"+k+",无法进行计算!");
              // A LinkedHashSet drops every repeated shingle while keeping first-seen order.
              Set<String> shingles=new LinkedHashSet<String>();
              for(int start=0;start+k<=str.length;start++)
              {
                  StringBuilder shingle=new StringBuilder();
                  for(int i=start;i<start+k;i++)
                  {
                      if(i>start)
                          shingle.append(TOKEN_SEPARATOR);
                      shingle.append(str[i]);
                  }
                  shingles.add(shingle.toString());
              }
              return shingles.toArray(new String[shingles.size()]);
          }

          /**
           * Raised when a text has too few words to form a single shingle.
           *
           * <p>Constructing the exception also shows the message in a dialog;
           * the message is additionally available through {@link #getMessage()}.
           */
          public class MyException extends Exception{
              public MyException(String str){
                  super(str);
                  JOptionPane.showMessageDialog(null, str,"警告", JOptionPane.INFORMATION_MESSAGE);
              }
          }

          /**
           * Counts the distinct strings that occur in at least one of the two arrays
           * (the size of their union).
           *
           * @param a first array of strings
           * @param b second array of strings
           * @return the number of distinct strings in {@code a} and {@code b} combined
           */
          public int heji(String[] a,String[] b){
              Set<String> all=new HashSet<String>(Arrays.asList(a));
              all.addAll(Arrays.asList(b));
              return all.size();
          }

          /**
           * Counts the distinct strings that occur in both arrays
           * (the size of their intersection).
           *
           * @param a first array of strings
           * @param b second array of strings
           * @return the number of distinct strings present in both {@code a} and {@code b}
           */
          public int jiaoji(String[] a,String[] b){
              Set<String> common=new HashSet<String>(Arrays.asList(a));
              common.retainAll(new HashSet<String>(Arrays.asList(b)));
              return common.size();
          }

      }
     1 package bigproject2;
     2 
     3 
     4 public class KShinglingAlgorithm extends union{
     5     private String text1,text2;
     6     public String getText1()
     7     {
     8         return text1;
     9     }
    10     public String getText2()
    11     {
    12         return text2;
    13     }
    14     public void setText1(String text1)
    15     {
    16         this.text1=text1;
    17     }
    18     public void setText2(String text2)
    19     {
    20         this.text2=text2;
    21     }
    22     
    23     public float getSimilarity(int k)
    24     {
    25        union a=new union();
    26        String[] t1=a.ziji(this.text1);
    27        String[] t2=a.ziji(this.text2);
    28        String[] t1t,t2t;
    29        try{
    30            t1t=a.cut(t1, k);
    31            t2t=a.cut(t2, k);
    32            
    33        }catch(MyException e){
    34                return -1;
    35        }
    36        int he=a.heji(t1t, t2t);
    37        int jiao=a.jiaoji(t1t, t2t);
    38        return (float)jiao/he;
    39     }
    40 
    41 }

    面板设计部分:

      1 package bigproject2;
      2 import java.awt.*;
      3 import java.awt.event.*;
      4 import java.io.BufferedReader;
      5 import java.io.File;
      6 import java.io.FileNotFoundException;
      7 import java.io.FileReader;
      8 import java.io.IOException;
      9 import java.io.InputStreamReader;
     10 
     11 import javax.swing.*;
     12 import javax.swing.event.*;
     13 import javax.swing.filechooser.FileNameExtensionFilter;
     14 
     15 public class Outlook extends JFrame{
     16     JFrame frm=new JFrame("相似度计算器");
     17     JPanel areabottom=new JPanel();
     18     JPanel areatop=new JPanel();
     19     JPanel areamiddle=new JPanel();
     20     static JTextArea tl=new JTextArea();
     21     static JTextArea tr=new JTextArea();
     22     JScrollPane left=new JScrollPane(tl,JScrollPane.VERTICAL_SCROLLBAR_ALWAYS,
     23             JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED);
     24     JScrollPane right=new JScrollPane(tr,JScrollPane.VERTICAL_SCROLLBAR_ALWAYS,
     25             JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED);
     26     JSplitPane sp=new JSplitPane(JSplitPane.HORIZONTAL_SPLIT,left,right);
     27     static JButton toBig=new JButton("全部大写");
     28     static JButton delbd=new JButton("去掉标点");
     29     static JButton count=new JButton("计算相似度");
     30     JLabel space=new JLabel("                                               ");
     31     JLabel t1=new JLabel("Text1");
     32     JLabel t2=new JLabel("Text2");
     33 
     34     JMenuBar mb=new JMenuBar();
     35     JMenu open=new JMenu("打开");
     36     JMenuItem opent1=new JMenuItem("打开到Text1");
     37     JMenuItem opent2=new JMenuItem("打开到Text2");
     38     
     39     private String str="";
     40     public Outlook()
     41     {
     42         judge();
     43         
     44         frm.setVisible(true);
     45         frm.setBounds(50, 50, 500, 400);
     46         frm.setLayout(new BorderLayout(5,5));
     47         
     48         frm.add("North",areatop);
     49         frm.add("Center",areamiddle);
     50         frm.add("South",areabottom);
     51         
     52         areatop.add(mb);
     53         mb.add(open);        
     54         open.add(opent1);
     55         open.add(opent2);
     56         open.setPreferredSize(new Dimension(40,18));
     57         mb.setBackground(frm.getBackground());
     58         areatop.setLayout(new FlowLayout(FlowLayout.LEFT));
     59         areamiddle.setLayout(new FlowLayout(FlowLayout.LEFT));
     60         
     61         areamiddle.add(t1);
     62         t1.setPreferredSize(new Dimension(frm.getWidth()/2-20,10));
     63         areamiddle.add(t2);
     64         t2.setPreferredSize(new Dimension(50,10));
     65         areamiddle.add(left);
     66         left.setPreferredSize(new Dimension(frm.getWidth()/2-20,frm.getHeight()/2));    
     67         areamiddle.add(right);
     68         right.setPreferredSize(new Dimension(frm.getWidth()/2-20,frm.getHeight()/2));
     69         tl.setLineWrap(true);
     70         tr.setLineWrap(true);
     71         
     72         areabottom.add(toBig);
     73         areabottom.add(delbd);
     74         areabottom.add(space);
     75         areabottom.add(count);
     76         
     77         opent1.addActionListener(new ActionListener(){
     78             public void actionPerformed(ActionEvent e) {
     79                 try {
     80                     openfile();
     81                     tl.setText(str);
     82                 } catch (IOException e1) {
     83                     e1.printStackTrace();
     84                 }
     85                 judge();
     86             }
     87         });
     88         opent2.addActionListener(new ActionListener(){
     89             public void actionPerformed(ActionEvent e) {
     90                 try {
     91                     openfile();
     92                     tr.setText(str);
     93                 } catch (IOException e1) {
     94                     e1.printStackTrace();
     95                 }
     96                 judge();
     97             }
     98         });
     99         toBig.addActionListener(new ActionListener(){
    100             public void actionPerformed(ActionEvent e){
    101                 tl.setText(tobig(tl.getText()));
    102                 tr.setText(tobig(tr.getText()));
    103             }
    104         });
    105         
    106         delbd.addActionListener(new ActionListener(){
    107             public void actionPerformed(ActionEvent e){
    108                 tl.setText(del(tl.getText()));
    109                 tr.setText(del(tr.getText()));
    110                 judge();
    111             }
    112             
    113         });
    114         count.addActionListener(new ActionListener(){
    115             public void actionPerformed(ActionEvent e){
    116                 KShinglingAlgorithm a=new KShinglingAlgorithm();
    117                 a.setText1(tl.getText());
    118                 a.setText2(tr.getText());
    119                 float b=a.getSimilarity(4);
    120                 if(b!=-1)
    121                     JOptionPane.showMessageDialog(null, Float.toString(b),"相似度", JOptionPane.INFORMATION_MESSAGE); 
    122             }
    123         });
    124         tr.addKeyListener(new KeyAdapter(){
    125             public void keyTyped(KeyEvent e){
    126                 judge();
    127             }
    128         });
    129         tl.addKeyListener(new KeyAdapter(){
    130             public void keyTyped(KeyEvent e){
    131                 judge();
    132             }
    133         });
    134     }
    135     public void judge(){
    136         if(tl.getText().length()!=0||tr.getText().length()!=0) {
    137             toBig.setEnabled(true);
    138             delbd.setEnabled(true);
    139             count.setEnabled(true);
    140         }
    141         else{
    142             toBig.setEnabled(false);
    143             delbd.setEnabled(false);
    144             count.setEnabled(false);
    145         }    
    146     }
    147     public void openfile() throws IOException{
    148         str="";
    149         JFileChooser choose=new JFileChooser();        
    150         int result = choose.showOpenDialog(this);
    151         File file = null; //注意初始化
    152         //加过滤器
    153         if (result == JFileChooser.APPROVE_OPTION) {
    154             file = choose.getSelectedFile();
    155             }
    156         else{
    157             return; //使点取消后不会抛出异常
    158         }
    159         FileReader fr=new FileReader(file);
    160         BufferedReader br=new BufferedReader(fr);
    161         char c[]=new char[512];
    162         String strline="";
    163         while(br.ready()){
    164             strline=br.readLine();
    165             str+=strline;
    166         };
    167         br.close();
    168         fr.close();
    169     }
    170     public String tobig(String str){
    171         String temp="";
    172         for(int i=0;i<str.length();i++)
    173         {
    174             if(str.charAt(i)>='a'&&str.charAt(i)<='z')
    175             {
    176                 char t=str.charAt(i);
    177                 t=(char)(str.charAt(i)-32);
    178                 temp+=t;
    179             }
    180             else temp+=str.charAt(i);
    181         }
    182         return temp;
    183     }
    184     
    185     public String del(String str){
    186         String temp="";
    187         for(int i=0;i<str.length();i++)
    188         {
    189             char t=str.charAt(i);
    190             if(t>='!'&&t<='/'||t>=58&&t<=64||t>=91&&t<=96||t>=123&&t<=126);
    191             else temp+=t;
    192         }
    193         return temp;
    194     }
    195     public static void main(String[] args){
    196         new Outlook();
    197         
    198         
    199     }
    200 }
    Outlook
  • 相关阅读:
    ConcurrentHashMap总结
    HashMap在多线程环境下操作可能会导致程序死循环
    oracle数据库的 to char 和to date 区别(时间格式化)
    SQL中的cast()函数用法
    常见的垃圾收集器有3类-java面试一
    mybatis中sql引用
    mysql find_in_set 查询
    用Redis实现微博关注关系的分析
    C#与C++相比较之STL篇(续一)
    Vite2.0 入门
  • 原文地址:https://www.cnblogs.com/verlen11/p/4184407.html
Copyright © 2011-2022 走看看