本项目需要把数据存档为二进制文件,载入时只载入文件索引,通过索引快速定位到数据内容,从而实现最小存储和最快速查找。下面的代码是初步实现,通过扩展,还可以实现搜索引擎的关键字匹配度、权重、分词等效果,这是后话,先把最基础的通过偏移量快速查找分享一下。
/// <summary>
/// One entry of the index file: a key plus the location of its record
/// inside the data payload.
/// </summary>
struct Token
{
    /// <summary>
    /// Lookup key of the record.
    /// </summary>
    public string ID;

    /// <summary>
    /// Offset of the record, in bytes.
    /// </summary>
    public int Offset;

    /// <summary>
    /// Length of the record, in bytes.
    /// </summary>
    public int Length;
}
/// <summary>
/// Demo of offset-based lookup: <see cref="BuildFile"/> writes a data file and
/// an index of "key,offset,length" entries; <see cref="ReadFile"/> loads only
/// the index and then seeks straight to the requested record.
/// </summary>
class Search
{
    private const string IndexPath = "index.txt";
    private const string DataPath = "data.txt";

    // Number of sample records generated by BuildFile.
    private const int EntryCount = 15;

    /// <summary>
    /// Generates the index file and the data file, replacing any existing ones.
    /// </summary>
    /// <remarks>
    /// Each key is the record number followed by today's date (yyyyMMdd), so
    /// the keys depend on the day the files were built.
    /// </remarks>
    public void BuildFile()
    {
        // Local buffer: every call starts from an empty payload, so repeated
        // calls cannot duplicate content or skew the offsets.
        StringBuilder content = new StringBuilder();
        Random rd = new Random();

        // Running byte offset of the next record within the payload.
        int offset = 0;

        // FileMode.Create truncates an existing file or creates a new one.
        using (FileStream indexFile = new FileStream(IndexPath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
        using (BinaryWriter bw = new BinaryWriter(indexFile, Encoding.UTF8))
        {
            for (int i = 0; i < EntryCount; i++)
            {
                int rdv = rd.Next(10, 305000);
                string indexerid = i.ToString() + DateTime.Today.ToString("yyyyMMdd");
                string record = indexerid + "|test programe" + rdv.ToString();

                content.Append(record);

                // Offsets and lengths are measured in UTF-8 bytes, not chars,
                // because ReadFile seeks in the encoded file.
                int len = Encoding.UTF8.GetByteCount(record);

                bw.Write(indexerid + "," + offset + "," + len);
                offset += len;
            }
        }

        using (FileStream dataFile = new FileStream(DataPath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
        using (BinaryWriter bw = new BinaryWriter(dataFile, Encoding.UTF8))
        {
            // Write(string) emits a 7-bit-encoded byte-count prefix followed by
            // the UTF-8 payload; ReadFile measures and skips that prefix.
            bw.Write(content.ToString());
        }
    }

    /// <summary>
    /// Loads the whole index file into memory.
    /// </summary>
    /// <returns>A dictionary mapping each key to its <see cref="Token"/>.</returns>
    /// <exception cref="FormatException">An offset or length field is not an integer.</exception>
    /// <exception cref="ArgumentException">The index contains the same key twice.</exception>
    public Dictionary<string, Token> GetTokenDic()
    {
        Dictionary<string, Token> dic = new Dictionary<string, Token>();

        using (FileStream indexFile = new FileStream(IndexPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        using (BinaryReader reader = new BinaryReader(indexFile, Encoding.UTF8))
        {
            // Read until the end of the file instead of assuming a fixed
            // number of entries.
            while (indexFile.Position < indexFile.Length)
            {
                string entry = reader.ReadString();
                string[] parts = entry.Split(',');

                // Skip entries that do not have the key,offset,length shape.
                if (parts.Length < 3)
                {
                    continue;
                }

                Token token = new Token();
                token.ID = parts[0];
                token.Offset = int.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
                token.Length = int.Parse(parts[2], System.Globalization.CultureInfo.InvariantCulture);
                dic.Add(token.ID, token);
            }
        }

        return dic;
    }

    /// <summary>
    /// Looks up <paramref name="key"/> in the index, reads the matching record
    /// from the data file and writes it to the console. An unknown key
    /// produces an empty line.
    /// </summary>
    /// <param name="key">Key of the record to print.</param>
    public void ReadFile(string key)
    {
        Dictionary<string, Token> dic = GetTokenDic();

        Token t;
        if (!dic.TryGetValue(key, out t))
        {
            Console.WriteLine(string.Empty);
            return;
        }

        using (FileStream file = new FileStream(DataPath, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (BinaryReader reader = new BinaryReader(file, Encoding.UTF8))
        {
            // The payload starts after the length prefix written by
            // BinaryWriter.Write(string). The prefix is 1 to 5 bytes depending
            // on payload size, so measure it rather than assuming 2.
            int prefixLength = MeasureLengthPrefix(file);

            file.Seek((long)prefixLength + t.Offset, SeekOrigin.Begin);

            // ReadBytes keeps reading until the requested count or end of
            // file, and returns only the bytes actually read.
            byte[] byData = reader.ReadBytes(t.Length);

            Console.WriteLine(Encoding.UTF8.GetString(byData));
        }
    }

    /// <summary>
    /// Consumes the 7-bit-encoded length prefix at the current stream position
    /// and returns how many bytes it occupies.
    /// </summary>
    /// <exception cref="EndOfStreamException">The stream ends inside the prefix.</exception>
    /// <exception cref="InvalidDataException">The prefix is longer than 5 bytes.</exception>
    private static int MeasureLengthPrefix(Stream stream)
    {
        // A 32-bit value needs at most five 7-bit groups.
        for (int size = 1; size <= 5; size++)
        {
            int b = stream.ReadByte();
            if (b < 0)
            {
                throw new EndOfStreamException("Data file ends inside its length prefix.");
            }

            // A clear high bit marks the last byte of the prefix.
            if ((b & 0x80) == 0)
            {
                return size;
            }
        }

        throw new InvalidDataException("Data file has a malformed length prefix.");
    }
}
/// <summary>
/// Demo entry point: prints the record stored under one hard-coded key.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Search search = new Search();

    // Enable the next line to (re)generate index.txt and data.txt first.
    //search.BuildFile();
    search.ReadFile("1420130825");

    // Block until input arrives (presumably to keep the console window open).
    Console.Read();
}