zoukankan      html  css  js  c++  java
  • Codeforces Gym 100338B Spam Filter 字符串哈希+贝叶斯公式

    原题链接:http://codeforces.com/gym/100338/attachments/download/2136/20062007-winter-petrozavodsk-camp-andrew-stankevich-contest-22-asc-22-en.pdf

    题意

    这是一个过滤垃圾邮件的算法,叫贝叶斯算法。这个算法的第一步是训练过程,通过人工给定的邮件,来确定每个词语在垃圾邮件中的概率和在普通邮件的概率。然后通过贝叶斯公式来计算每个邮件是否为垃圾邮件。具体过程可以看题,或者维基百科。

    题解

    模拟题目的过程即可,不过要注意的是,为了避免超时,必须哈希,使用 最小表示来记录字符串。

    代码

    //#include<iostream>
    #include<cstring>
    #include<fstream>
    #include<vector>
    #include<map>
    #include<algorithm>
    #include<string>
    #include<set>
    #include<cmath>
    #include<queue>
    #define eps 1e-10
    #define MAX_N 1234
    #define Pr 131
    #define mod 1000000009
    using namespace std;
    
    typedef long long ll;
    
    int s,g,n,t;
    string sp[MAX_N];
    string go[MAX_N];
    string ma[MAX_N];
    
    set<ll> spam[MAX_N];
    set<ll> good[MAX_N];
    set<ll> mail[MAX_N];
    
    set<ll> allWord;
    
    string bankLine;
    
    map<ll, double> wordIsSpam;
    map<ll, double> wordIsGood;
    
    double pSpam;
    double pGood;
    
    ll Hash(string ss) {
        ll tmp = Pr;
        ll res = 0;
        for (auto c:ss) {
            res = (res + c * tmp) % mod;
            tmp = (tmp * Pr) % mod;
        }
        return res;
    }
    
    string changeToSmall(string ss) {
        string res = "";
        for (auto c:ss) {
            if (c <= 'Z' && c >= 'A')c = c - 'A' + 'a';
            res = res + c;
        }
        return res;
    }
    
    bool isAl(char c) {
        if (c <= 'Z' && c >= 'A')return true;
        return (c <= 'z' && 'a' <= c);
    }
    
    void divi(string v[],set<ll> G[],int x) {
        for (int i = 0; i < x; i++) {
            int p = 0;
            while ((!isAl(v[i][p])) && p < v[i].length())p++;
            bool flag = true;
            for (int j = p; j < v[i].length(); j++) {
                if ((!isAl(v[i][j])) && flag) {
                    flag = false;
                    string tmp;
                    tmp.assign(v[i].begin() + p, v[i].begin() + j);
                    tmp = changeToSmall(tmp);
                    allWord.insert(Hash(tmp));
                    G[i].insert(Hash(tmp));
                }
                else if (isAl(v[i][j]) && flag == false) {
                    p = j;
                    flag = true;
                }
            }
        }
    }
    
    double divide(double a,double b) {
        if (fabs(b)<eps)return 0;
        return a / b;
    }
    
    double P(ll word) {
        double ws, wg;
    
        if (wordIsSpam.find(word) == wordIsSpam.end())ws = 0;
        else ws = wordIsSpam[word];
    
        if (wordIsGood.find(word) == wordIsGood.end())wg = 0;
        else wg = wordIsGood[word];
    
        return divide(ws * pSpam, ws * pSpam + wg * pGood);
    }
    
    int main() {
        ifstream cin("spam.in");
        ofstream cout("spam.out");
        cin.sync_with_stdio(false);
        cin >> s >> g >> n >> t;
        pSpam = divide(s, s + g);
        pGood = divide(g, s + g);
        getline(cin, bankLine);
        for (int i = 0; i < s; i++) {
            getline(cin, sp[i]);
            sp[i] = sp[i] + " ";
        }
        for (int i = 0; i < g; i++) {
            getline(cin, go[i]);
            go[i] = go[i] + " ";
        }
        for (int i = 0; i < n; i++) {
            getline(cin, ma[i]);
            ma[i] = ma[i] + " ";
        }
        divi(sp, spam, s);
        divi(go, good, g);
        divi(ma, mail, n);
    
        for (auto word:allWord) {
            int cnt = 0;
            for (int i = 0; i < s; i++)
                if (spam[i].find(word) != spam[i].end())cnt++;
            wordIsSpam[word] = divide(cnt, s);
            cnt = 0;
            for (int i = 0; i < g; i++)
                if (good[i].find(word) != good[i].end())cnt++;
            wordIsGood[word] = divide(cnt, g);
        }
        for (int i = 0; i < n; i++) {
            double ans = 0;
            for (auto word:mail[i]) {
                double p = P(word);
                //cout<<word<<": "<<p<<endl;
                if (p > 0.5 || fabs(p - 0.5) < eps)ans++;
            }
            if (ans * 100 / mail[i].size() < t)cout << "good" << endl;
            else cout << "spam" << endl;
        }
        return 0;
    }
  • 相关阅读:
    Argparse4j
    Socket通信中的 BeginReceive与EndReceive
    socket 异步通信的一些问题
    static 还是readonly 还是static readonly
    隐藏Android下的虚拟按键
    IIS上架设https网站证书处理备忘
    [转] 移动平台Html5的viewport使用经验
    Air File.load加载问题
    x64位windows 2003 server中“Server 对象 错误 'ASP 0177 : 800700c1' Server.CreateObject 失败”问题
    报 "错误: 无法取消引用int" 的问题解决纪录
  • 原文地址:https://www.cnblogs.com/HarryGuo2012/p/4748809.html
Copyright © 2011-2022 走看看