zoukankan      html  css  js  c++  java
  • 唐诗词组频率,高频字居然跟宋词相似

    既然R语言的程序已经能运行了,那直接把唐诗的也统计一下。(有空还是用C++写一下,R非常简洁,但是判断不是很精确。)

    # Read the corpus; with sep = "\n" every text line becomes one element.
    l <- scan("tangshi.txt", what = "character", sep = "\n")
    l.len <- nchar(l)

    # Some lines hold only an author or a title, so keep lines longer than
    # 10 characters. The text file is also untidy (stray URLs and the like),
    # so lines of 500 characters or more are dropped as well.
    ci <- l[l.len > 10 & l.len < 500]

    # Split every line into sentences at punctuation marks.
    # A character class is used instead of an alternation: a bare `?` is a
    # regex quantifier, not a literal question mark. Both the ASCII and the
    # full-width forms of the comma, exclamation mark and question mark are
    # listed so the split works whichever form the corpus uses.
    sentences <- unlist(strsplit(ci, "[,。!?、,!?]"))
    sentences <- sentences[sentences != ""]
    s.len <- nchar(sentences)

    # Length (in characters) of the n-grams to extract.
    group <- 2
    # A very long "sentence" most likely contains stray characters, so it is
    # discarded; a sentence shorter than `group` cannot yield any n-gram.
    sentences <- sentences[s.len <= 10 & s.len >= group]
    s.len <- nchar(sentences)
     
    # Brute-force extraction of every run of `n` consecutive characters.
    # For example, the two-character runs of “犹解嫁东风” are
    # “犹解”, “解嫁”, “嫁东” and “东风”. Meaningless combinations simply end
    # up with low counts once frequencies are tallied.
    #
    # x      a single string
    # x.len  number of characters in `x`
    # n      run length; defaults to the script-level `group`
    #
    # Returns a character vector of x.len - n + 1 substrings, or an empty
    # character vector when `x` is shorter than `n`.
    splitwords <- function(x, x.len, n = group) {
      if (x.len < n) {
        # 1:(x.len + 1 - n) would count backwards here; return nothing.
        return(character(0))
      }
      starts <- seq_len(x.len - n + 1)
      substring(x, starts, starts + n - 1)
    }
     
    # Extract the n-grams of every sentence. SIMPLIFY = FALSE always yields a
    # list, so the result shape does not depend on whether all sentences
    # happen to have the same length.
    words <- mapply(splitwords, sentences, s.len,
                    SIMPLIFY = FALSE, USE.NAMES = FALSE)
    words <- unlist(words)

    # Count each n-gram and order from most to least frequent.
    words.freq <- sort(table(words), decreasing = TRUE)

    # Keep at most the 100 most frequent n-grams. Indexing with 1:100 would
    # pad the result with NA rows when fewer than 100 distinct n-grams exist.
    top.n <- min(100, length(words.freq))
    top.freq <- words.freq[seq_len(top.n)]
    df <- data.frame(Word = as.character(names(top.freq)),
                     Freq = as.integer(top.freq))
    write.table(df, "1.txt")


    二字词组(两个字的组合):

    "Word" "Freq"
    "1" "何处" 1653
    "2" "不知" 1457
    "3" "万里" 1439
    "6" "千里" 1294
    "7" "今日" 1150
    "8" "不见" 1139
    "9" "不可" 1133
    "10" "春风" 1118
    "11" "白云" 1099
    "12" "不得" 942
    "13" "明月" 888
    "14" "人间" 879
    "15" "无人" 869
    "16" "风吹" 831
    "17" "故人" 784
    "18" "惆怅" 768
    "19" "秋风" 745
    "20" "悠悠" 733
    "21" "相思" 723
    "22" "长安" 721
    "23" "白日" 687
    "24" "如何" 683
    "25" "十年" 674
    "26" "青山" 662
    "27" "何人" 655
    "28" "少年" 628
    "29" "相逢" 627
    "30" "平生" 585
    "31" "寂寞" 584
    "32" "天子" 584
    "33" "天地" 581
    "34" "黄金" 578
    "35" "年年" 578
    "36" "人不" 576
    "37" "何事" 573
    "38" "江上" 555
    "39" "流水" 548
    "40" "回首" 531
    "41" "可怜" 531
    "42" "主人" 521
    "43" "如此" 520
    "44" "白发" 516
    "45" "今朝" 513
    "46" "从此" 503
    "47" "日月" 502
    "48" "月明" 502
    "49" "行人" 500
    "50" "落日" 493
    "51" "不如" 492
    "52" "将军" 492
    "53" "归去" 489
    "54" "日暮" 482
    "55" "别离" 478
    "56" "洛阳" 476
    "57" "不能" 471
    "58" "此时" 470
    "59" "天下" 470
    "60" "何时" 469
    "61" "无事" 467
    "62" "芳草" 466
    "63" "江南" 463
    "64" "相见" 462
    "65" "归来" 461
    "66" "夕阳" 458
    "67" "当时" 454
    "68" "杨柳" 451
    "69" "风雨" 448
    "70" "》)" 445
    "71" "东风" 436
    "72" "洞庭" 433
    "73" "青云" 432
    "74" "花落" 428
    "75" "参差" 427
    "76" "天涯" 426
    "77" "芙蓉" 425
    "78" "落花" 424
    "79" "清风" 421
    "80" "不是" 416
    "81" "烟霞" 416
    "82" "三十" 414
    "83" "白头" 413
    "84" "桃花" 411
    "85" "不相" 410
    "86" "唯有" 407
    "87" "何如" 404
    "88" "南山" 397
    "89" "谁能" 395
    "90" "君不" 394
    "91" "千年" 391
    "92" "天上" 389
    "93" "如今" 385
    "94" "花开" 382
    "95" "桃李" 380
    "96" "与君" 380
    "97" "此地" 378
    "98" "殷勤" 378
    "99" "浮云" 376
    "100" "君王" 375


    三字词组(三个字的组合):

    "Word" "Freq"
    "6" "君不见" 224
    "11" "不知何" 127
    "13" "行路难" 108
    "14" "三千里" 108
    "17" "不可见" 100
    "22" "知何处" 90
    "23" "在何处" 89
    "24" "二十年" 87
    "28" "三十六" 85
    "30" "三十年" 75
    "31" "无消息" 74
    "32" "不相见" 73
    "33" "何处去" 70
    "34" "无一事" 70
    "35" "洛阳城" 69
    "36" "千万里" 69
    "38" "何处是" 68
    "40" "水东流" 67
    "44" "归未得" 65
    "45" "向人间" 63
    "46" "歌一曲" 62
    "49" "千里外" 61
    "50" "一杯酒" 61
    "52" "明月夜" 58
    "53" "归何处" 57
    "54" "从此去" 56
    "55" "东风吹" 56
    "56" "今何在" 55
    "57" "皮日休" 55
    "58" "人不知" 55
    "59" "春风吹" 54
    "61" "不知谁" 53
    "62" "草萋萋" 53
    "63" "归去来" 53
    "64" "不得意" 52
    "65" "人不见" 52
    "66" "无人知" 52
    "67" "长安道" 52
    "68" "复何如" 51
    "69" "人间事" 51
    "70" "与君同" 51

  • 相关阅读:
    tabhost切换标签:Log中出现You must supply a layout_width attribute的解决方法
    listview去掉底部多出的边框黑色
    使用fragmenttabhost后,子fragment怎么获取ID?怎么用getSharedPreferences
    android用shape给linearLayout设置边框,怎样只保留底部或顶部的边框,把其它三个方向的边框去掉呢?
    linux删除文件未释放空间问题处理
    mount: unknown filesystem type 'LVM2_member'解决方案【转】
    centos系统lvm的安装
    一个或多个音频服务未运行 win7 错误1079:此服务的账户不同于运行于同一进程上的其他服务账户
    php SimpleXML
    new JSONObject(str)无法解析 报错:org.json.JSONException: Value of type java.lang.String cannot be converted to JSONObject
  • 原文地址:https://www.cnblogs.com/marryZhan/p/2497574.html
Copyright © 2011-2022 走看看