zoukankan      html  css  js  c++  java
  • miniblast_hash算法c语言实现

    对于一组基因文件中的基因序列,选取一段基因片段,作为索引,利用hash表,查找固定的基因片段。有一定的并且容忍错误。

    简单讲就是自己实现一个hashtable,将选出特定字符串建立索引,便于查询。输出时可容忍一定数量的错误。

    贴上代码

    HashTable.h

    #include <iostream>
    using namespace std;
    
    typedef int KeyType;
    
    #define NULLKEY -1
    
    struct Entry {
        KeyType _key;
        string    _value;
        Entry(KeyType key = NULLKEY, string value = "") :_key(key), _value(value) {}
    };
    
    class hashTable {
    public:
        hashTable();
        hashTable::hashTable(int table_size, double occupancy_get);
        //hashTable(int tableSize);
        ~hashTable();
    
        bool find(const Entry&  e);
        string hashTable::getValue(const KeyType& k);
        bool insert(const Entry& e);
        bool remove(const Entry& e);
        void clear();
        Entry& operator[](KeyType key);//重载下标操作;当找不到key对应的Entry时,插入Entry(key,0)
        int size();
        void display();
    
    protected:
        int hashFunction(KeyType key);//将键值映射到对应地址
        void rehash();//调整hashTable大小
        bool find(const KeyType& k);//按键值查找
        int nextPrime();//p(n) = n^2 - n + 41, n<41, p<1681
    
    private:
        Entry *_pTable;
        int _pos;//当前访问元素的位置
        int _size;
        int _capacity;
        int primeIndex;
    };

    HashTable.cpp

    #include "HashTable.h"
    static double _occupancy = 0.5;
    //开放定址法
    hashTable::hashTable()
    {
        _capacity = 100;//初始化hashTable容量为100,便于观察rehash过程
        _pTable = new Entry[_capacity];
        _size = 0;
        primeIndex = 1;
    }
    hashTable::hashTable(int table_size,double occupancy_get)
    {
        _capacity = table_size;
        _occupancy = occupancy_get;
        _pTable = new Entry[_capacity];
        _size = 0;
        primeIndex = 1;
    }
    
    
    hashTable::~hashTable()
    {
        clear();
    }
    
    int hashTable::nextPrime()
    {
        int p = std::pow(static_cast<float>(primeIndex), 2) - primeIndex +3777;
        primeIndex = primeIndex << 1;
        if (primeIndex >= 3777) {
            cout << "Max capacity reached. exit!" << endl;
            exit(-1);
        }
        return p;
    }
    
    bool hashTable::find(const Entry&  e)
    {
        return(find(e._key));
    }
    
    bool hashTable::find(const KeyType& k)
    {
        _pos = hashFunction(k);
        if (_pTable[_pos]._key == NULLKEY)
            return false;
        int lastPos = _pos;
        while (_pTable[_pos]._key != k) {
            if (++_pos%_capacity == lastPos)
                return false;
        }
        return true;
    }
    
    string hashTable::getValue(const KeyType& k)
    {
        _pos = hashFunction(k);
        if (_pTable[_pos]._key == NULLKEY)
            return "";
        int lastPos = _pos;
        while (_pTable[_pos]._key != k) {
            if (++_pos%_capacity == lastPos)
                return "";
        }
        return _pTable[_pos]._value;
    }
    
    bool hashTable::insert(const Entry& e)
    {
        if ((_size*1.0) / _capacity>_occupancy) //通过occupancy判断是否需要扩容
            rehash();//[OK]插入操作前,需要判断hash table是否需要扩容
        if (find(e))
            return false;
        _pTable[_pos] = e;
        ++_size;
        return true;
    }
    
    bool hashTable::remove(const Entry& e)
    {
        if (!find(e))
            return false;
        _pTable[_pos]._key = NULLKEY;
        --_size;
        //rehash();//移除操作后,需要判断hash table是否需要缩容
        return true;
    }
    
    void hashTable::clear()
    {
        delete[]_pTable;
        _size = _capacity = 0;
    }
    
    Entry& hashTable::operator[](KeyType key)
    {
        if (!find(key))
            insert(Entry(key, 0));
        return _pTable[_pos];
    }
    
    int hashTable::size()
    {
        return _size;
    }
    
    int hashTable::hashFunction(KeyType key)
    {
        return key%_capacity;
    }
    
    void hashTable::display()
    {
        cout << "capacity = " << _capacity << ", size = " << _size << endl;
        for (int i = 0; i<_capacity; i++) {
            if (_pTable[i]._key != NULLKEY);
    //            cout << "key=" << _pTable[i]._key << ",value=" << _pTable[i]._value << endl;
        }
    }
    
    
    void hashTable::rehash()
    {
        //cout << "begin rehash..." << endl;
        Entry *p = new Entry[_size];//用来暂存原哈希表
        for (int i = 0; i<_capacity; i++) {//i<_size不对;元素散列在容量为_capacity的hashTable中
            if (_pTable[i]._key != NULLKEY)
                *(p + i) = _pTable[i];
        }
        delete[]_pTable;
        int lastSize = _size;
        _size = 0;
    
        _capacity = nextPrime();
        _pTable = new Entry[_capacity];
        for (int i = 0; i<lastSize; i++)
            insert(*(p + i));
        delete[]p;
    }

    miniblast_hash.cpp

    #include <iostream>
    #include <string>
    #include <fstream>
    #include "HashTable.h"
    
    using namespace std;
    const int KMER_MAX = 100;
    
    //得到基因组文件中,position开始,长度为len的字符串
    string getStringByPos(string file_name, long position,int str_len) {
        //读入数据
        std::ifstream myfile(file_name);
        if (!myfile.is_open())
        {
            cout << "未成功打开文件" << endl;
        }
        string s = "";
        //移动文件指针到基因指定位置,因为有无用字符,需要边移动边判断
        for (long i = 0; i < position; i++)
        {
            char c;
            myfile.get(c);
            //移动指针的过程中要忽略无意义字符
            if (c=='/0'||c=='
    ')
                i--;
        }
        //截取指定长度的字符串
        for (long i = 0; i < str_len; i++)
        {
            char c;
            myfile.get(c);
            //忽略无意义字符
            if (c == '/0' || c == '
    ')
                i--;
            else
            {
                switch (c)
                {
                case 'A':
                    s.append("A");
                    break;
                case 'G':
                    s.append("G");
                    break;
                case 'C':
                    s.append("C");
                    break;
                case 'T':
                    s.append("T");
                    break;
                default:
                    break;
                }
            }
        }
        return s;
    }
    //比较检索基因片段与原始片段相差个数
    int compareString(string s1, string s2, int len) {
        int num = 0;
        for (int i = 0; i < len; i++)
        {
            if (s1[i] != s2[i])
                num++;
        }
        return num;
    }
    //搜索指定基因片段
    void searchGenome(string file_name,hashTable *pTable,int mistakeNum,int kmer,string query_string,string out_name) {
        char *kmer_cut = new char[kmer];
        int length = query_string.length();
        int findNum = 0;
        int table_size = pTable->size();
        //std::ofstream out(out_name, ios::app);//输出结果至文件
        for (int i = 0; i <kmer; i++)
        {
            kmer_cut[i] = query_string[i];
        }
        string kmer_string(&kmer_cut[0],&kmer_cut[kmer]);
        for (int i = 0; i <table_size; i++) {
            
            string get = pTable->getValue(i);
            if (kmer_string == get)
            {
                
                if (i + length > table_size)
                    continue;
                string ss = getStringByPos(file_name, i, length);
                int num = compareString(ss, query_string,length);
                if (num<=mistakeNum)
                {
                    findNum++;
                    cout <<i<<" "<<num<<" "<<ss << endl;
                //    out << i <<" "<< num <<" " << ss << endl;
                }
            }
        }
        
        if (findNum==0)
        {
            cout<< "No Match" << endl;
        //    out << "No Match" << endl;
        }
        //out.close();
    }
    //构建基因搜索用的hashtable索引
    void GenomeIndex(string genome_file_name,int kmer, hashTable *pTable) {
        //读入数据,建立索引
        std::ifstream myfile(genome_file_name);
        if (!myfile.is_open())
        {
            cout << "未成功打开文件" << endl;
        }
        char *c = new char[KMER_MAX];
        myfile.get(c, kmer + 1);
        string add(&c[0], &c[kmer]);
        int geno_loc = 0;
        pTable->insert(Entry(0, add));
        //cout << add << endl;
        while (!myfile.eof())
        {
            char next_char;
            myfile.get(next_char);
            //去除文件中无用字符
            if (next_char != '/0'&&next_char != '
    ')
            {
                for (int i = 0; i < kmer; i++) 
                    c[i] = c[i + 1];
                    
                c[kmer-1] = next_char;
                string add(&c[0], &c[kmer]);
                geno_loc++;
                pTable->insert(Entry(geno_loc, add));
                //cout << add <<geno_loc<< endl;
            }
        }
        myfile.close();
    }
    
    int main(int   argc, char*   argv[])
    {
        
        /*string input_file_name ="";
        string out_file_name = "";
        if (argc>1)
        {
            input_file_name = string(argv[1]);
            out_file_name = string(argv[2]);
        }*/
        //如果不需要命令行参数执行文件,直接赋值input_file_name就好
        //input_file_name = "input_small.txt";
        std::freopen(argv[1], "r", stdin);
        std::freopen(argv[2], "w", stdout);
        //std::ifstream inputfile(input_file_name);
        
        
        hashTable *pTable=new hashTable();
        /*if (!inputfile.is_open())
        {
            cout << "未成功打开文件" << endl;
        }*/
        //先读取命令文件,设置相关参数,初始值为随意指定,无意义
        string str_order = "";
        string genome_file_name = "";
        int table_size =20;
        double occupancy =0.4;
        int kmer = 4;
        int mistakeNum = 0;
        string query_string;
        bool isIndex = false;
        while (str_order!="quit")
        {
            cin >> str_order;
            if (str_order=="genome")
            {
                cin >> genome_file_name;
            }
            else if (str_order == "table_size") {
                cin >> table_size;
            }
            else if (str_order == "occupancy") {
                cin >> occupancy;
                pTable = new hashTable(table_size, occupancy);
            }
            else if (str_order == "kmer") {
                cin >> kmer;
            }
            else if (str_order == "query") {
                cin >> mistakeNum >> query_string;
                //首次进行查询前,要建立索引hashTable
                if (!isIndex)
                {
                    isIndex = true;
                    GenomeIndex(genome_file_name, kmer, pTable);
                }
                //std::ofstream out(out_file_name,ios::app);//输出结果至文件
                /*if (out.is_open())
                {
                    out << "Query: " << query_string <<endl;
                    
                }*/
                cout << "Query: " << query_string << endl;
                //out.close();
                searchGenome(genome_file_name, pTable, mistakeNum, kmer, query_string, "");
            }
        }
        
        delete pTable;//释放指针,防止内存泄露
        //getchar();//暂停程序,观察结果
        return 0;
    }

      因为需要,代码上有详细注解,欢迎交流学习。

    本博客所有内容为原创,转载需征求作者同意。
  • 相关阅读:
    汉诺塔系列问题: 汉诺塔II、汉诺塔III、汉诺塔IV、汉诺塔V、汉诺塔VI、汉诺塔VII
    2014工大校赛题目以及解
    三国武将查询系统 //Java 访问 数据库
    LR(1)文法分析器 //c++ 实现
    维护前面的position+主席树 Codeforces Round #406 (Div. 2) E
    区间->点,点->区间,线段树优化建图+dijstra Codeforces Round #406 (Div. 2) D
    有向图博弈+出度的结合 Codeforces Round #406 (Div. 2) C
    树的性质和dfs的性质 Codeforces Round #403 (Div. 2, based on Technocup 2017 Finals) E
    还不会做! 树上的gcd 树分治 UOJ33
    树上的构造 树分治+树重心的性质 Codeforces Round #190 (Div. 2) E
  • 原文地址:https://www.cnblogs.com/xueyudlut/p/9044984.html
Copyright © 2011-2022 走看看