zoukankan      html  css  js  c++  java
  • C# 分析搜索引擎url 得到搜索关键字

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Web;
    
    namespace HbTui.Portal
    {
        public class SearchKeyWord
        {
            private string[][] _Enginers = new string[][]
            {       
                new string[]{"google","utf8","q"},
                new string[]{"baidu","gb2312","wd"},
                new string[]{"yahoo","utf8","p"},
                new string[]{"yisou","utf8","search"},
                new string[]{"live","utf8","q"},
                new string[]{"tom","gb2312","word"},
                new string[]{"163","gb2312","q"},
                new string[]{"iask","gb2312","k"},
                new string[]{"soso","gb2312","w"},
                new string[]{"sogou","gb2312","query"},
                new string[]{"zhongsou","gb2312","w"},
                new string[]{"3721","gb2312","p"},
                new string[]{"openfind","utf8","q"},
                new string[]{"alltheweb","utf8","q"},
                new string[]{"lycos","utf8","query"},
                new string[]{"youdao","utf8","q"},
                new string[]{"onseek","utf8","q"},
                new string[]{"jike","utf8","q"}
            };
    
            //搜索引擎名称
            private string _EngineName = string.Empty;
            public string EngineName
            {
                get
                {
                    return _EngineName;
                }
            }
    
            //搜索引擎编码
            private string _Coding = "utf8";
            public string Coding
            {
                get
                {
                    return _Coding;
                }
            }
            //搜索引擎关键字查询参数名称
            private string _RegexWord = "";
            public string RegexWord
            {
                get
                {
                    return _RegexWord;
                }
            }
    
            private string _Regex = @"(";
    
            //搜索引擎关键字
            //建立搜索关键字正则表达式
            public void EngineRegEx(string myString)
            {
                for (int i = 0, j = _Enginers.Length; i < j; i++)
                {
                    if (myString.Contains(_Enginers[i][0]))
                    {
                        _EngineName = _Enginers[i][0];
                        _Coding = _Enginers[i][1];
                        _RegexWord = _Enginers[i][2];
                        _Regex += _EngineName + @".+.*[?/&]" + _RegexWord + @"[=:])(?<key>[^&]*)";
                        break;
                    }
                }
            }
    
            //得到搜索引擎关键字
            public string SearchKey(string myString)
            {
                EngineRegEx(myString.ToLower());
                if (_EngineName != "")
                {
                    Regex myReg = new Regex(_Regex, RegexOptions.IgnoreCase);
                    Match matche = myReg.Match(myString);
                    myString = matche.Groups["key"].Value;
                    //去处表示为空格的+
                    myString = myString.Replace("+", " ");
                    if (_Coding == "gb2312")
                    {
                        myString = GetUTF8String(myString);
                    }
                    else
                    {
                        myString = Uri.UnescapeDataString(myString);
                    }
                }
                return myString;
            }
    
            //整句转码
            public string GetUTF8String(string myString)
            {
                Regex myReg = new Regex("(?<key>%..%..)", RegexOptions.IgnoreCase);
                MatchCollection matches = myReg.Matches(myString);
                string myWord;
                for (int i = 0, j = matches.Count; i < j; i++)
                {
                    myWord = matches[i].Groups["key"].Value.ToString();
                    myString = myString.Replace(myWord, GB2312ToUTF8(myWord));
                }
                return myString;
            }
            //单字GB2312转UTF8 URL编码
            public string GB2312ToUTF8(string myString)
            {
                string[] myWord = myString.Split('%');
                byte[] myByte = new byte[] { Convert.ToByte(myWord[1], 16), Convert.ToByte(myWord[2], 16) };
                Encoding GB = Encoding.GetEncoding("GB2312");
                Encoding U8 = Encoding.UTF8;
                myByte = Encoding.Convert(GB, U8, myByte);
                char[] Chars = new char[U8.GetCharCount(myByte, 0, myByte.Length)];
                U8.GetChars(myByte, 0, myByte.Length, Chars, 0);
                return new string(Chars);
            }
    
            //判断否为搜索引擎爬虫,并返回其类型
            public string isCrawler(string SystemInfo)
            {
                string[] BotList = new string[] { "Google", "Baidu", "yisou", "MSN", "Yahoo", "live", "tom", "163", "TMCrawler", "iask", "Sogou", "soso", "youdao", "zhongsou", "3721", "openfind", "alltheweb", "lycos", "bing", "118114" };
                foreach (string Bot in BotList)
                {
                    if (SystemInfo.ToLower().Contains(Bot.ToLower()))
                    {
                        return Bot;
                    }
                }
                return "null";
            }
            public bool IsSearchEnginesGet(string str)
            {
                string[] strArray = new string[] { "Google", "Baidu", "yisou", "MSN", "Yahoo", "live", "tom", "163", "TMCrawler", "iask", 
                "Sogou", "soso", "youdao", "zhongsou", "3721", "openfind","alltheweb", "lycos", "bing", "118114"};
                str = str.ToLower();
                for (int i = 0; i < strArray.Length; i++)
                {
                    if (str.IndexOf(strArray[i].ToLower()) >= 0)
                    {
                        return true;
                    }
                }
                return false;
            }
        }
    }
  • 相关阅读:
    JD笔试试题(凭记忆写的+人生感悟 try finally )
    ZOJ 3626 Treasure Hunt I(树形dp)
    Oracle数据库有用函数
    leetcode
    BIEE11G系统数据源账号过期问题(默认安装步骤)
    class文件结构浅析(2)
    使用Lua 局部变量来优化性能,同一时候比較局部变量和全局变量
    Linux基本配置和管理 3 ---- Linux命令行文本处理工具
    android面试题及答案
    CentOS 6.4的安装--史上最全-CRPER木木
  • 原文地址:https://www.cnblogs.com/niaowo/p/4022442.html
Copyright © 2011-2022 走看看