zoukankan      html  css  js  c++  java
  • 抓取 在线翻译(Google、Yahoo)

    最近公司要求我们做一个调用Google、Yahoo全文翻译的工具,在园子搜索了一下,找到了一篇(不大记得了,好像是VB写的),看了他的做法以后,做了一点点改进,发出来给需要的朋友看看,有什么不对的地方请大家斧正!


    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Threading;

    namespace Transn
    {
        
    /// <summary>
    /// Screen-scraping client for the Google and Yahoo online translation pages:
    /// posts the text as an HTML form and cuts the translation out of the returned page.
    /// </summary>
    class TsMachine
    {
        /// <summary>Pause before the single retry, in milliseconds.</summary>
        private const int RetryDelayMilliseconds = 1000;

        /// <summary>Timeout applied to every request, in milliseconds.</summary>
        private const int RequestTimeoutMilliseconds = 50000;

        /// <summary>
        /// Largest slice passed to <see cref="Uri.EscapeDataString"/> in one call.
        /// That method rejects very long inputs, and whole documents are expected here.
        /// </summary>
        private const int EscapeChunkLength = 8192;

        /// <summary>Signature shared by the two scraping methods.</summary>
        private delegate string TranslateCall(string texts, string languages);

        /// <summary>
        /// Translates <paramref name="texts"/> through Google, retrying once after a short pause.
        /// </summary>
        /// <param name="texts">Plain text to translate.</param>
        /// <param name="languages">Language pair such as "en|fr".</param>
        /// <returns>The translated text, or an empty string when both attempts fail.</returns>
        public string Google(string texts, string languages)
        {
            return TranslateWithRetry(new TranslateCall(Google_T), texts, languages);
        }

        /// <summary>
        /// Performs one Google translation round trip.
        /// </summary>
        /// <param name="texts">Plain text to translate.</param>
        /// <param name="languages">Language pair such as "en|fr".</param>
        /// <returns>The translated text with line breaks and common entities decoded.</returns>
        /// <exception cref="InvalidDataException">The result container is not in the page.</exception>
        private string Google_T(string texts, string languages)
        {
            // Values are escaped so that '&', '+', '%' or '=' inside the text cannot
            // break the form body apart.
            string payload = "&text=" + UrlEncode(texts) + "&langpair=" + UrlEncode(languages);
            string html = PostForm(@"http://translate.google.com/translate_t?hl=zh-CN&ie=utf8", payload);
            string fragment = ExtractBetween(html, "<div id=result_box dir=\"ltr\">", "</div>");
            return DecodeGoogleHtml(fragment);
        }

        /// <summary>
        /// Translates <paramref name="texts"/> through Yahoo, retrying once after a short pause.
        /// </summary>
        /// <param name="texts">Plain text to translate.</param>
        /// <param name="languages">Language pair in Yahoo notation such as "en_fr".</param>
        /// <returns>The translated text, or an empty string when both attempts fail.</returns>
        public string Yahoo(string texts, string languages)
        {
            return TranslateWithRetry(new TranslateCall(Yahoo_T), texts, languages);
        }

        /// <summary>
        /// Performs one Yahoo translation round trip.
        /// </summary>
        /// <param name="texts">Plain text to translate.</param>
        /// <param name="languages">Language pair in Yahoo notation such as "en_fr".</param>
        /// <returns>The translated text with the dnt wrappers removed and line breaks restored.</returns>
        /// <exception cref="InvalidDataException">The result container is not in the page.</exception>
        public string Yahoo_T(string texts, string languages)
        {
            string payload = "more=1&ei=UTF-8&trtext=" + UrlEncode(texts) + "&lp=" + UrlEncode(languages);
            string html = PostForm(@"http://fanyi.yahoo.com.cn/translate_txt?", payload);
            string fragment = ExtractBetween(html, "<div id=\"pd\" class=\"pd\">", "</div>\n\t\t\t</div>");
            return fragment.Replace("<dnt>", "").Replace("</dnt>", "").Replace("<br/>", "\r\n");
        }

        /// <summary>
        /// Runs <paramref name="call"/>, and on any failure waits briefly and runs it once more.
        /// Failures are deliberately swallowed (best effort); the last one is written to the trace.
        /// </summary>
        private static string TranslateWithRetry(TranslateCall call, string texts, string languages)
        {
            try
            {
                return call(texts, languages);
            }
            catch (Exception)
            {
                Thread.Sleep(RetryDelayMilliseconds);
            }

            try
            {
                return call(texts, languages);
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.WriteLine(ex);
                return "";
            }
        }

        /// <summary>
        /// Posts <paramref name="payload"/> as a UTF-8 url-encoded form and returns the response body.
        /// </summary>
        private static string PostForm(string url, string payload)
        {
            WebRequest req = WebRequest.Create(url);
            // NOTE(review): default credentials are offered to an external site here; kept as in
            // the original, possibly intended for an authenticating proxy - confirm.
            req.Credentials = CredentialCache.DefaultNetworkCredentials;
            req.Method = "POST";
            req.ContentType = "application/x-www-form-urlencoded";
            req.Timeout = RequestTimeoutMilliseconds;

            byte[] bytes = Encoding.UTF8.GetBytes(payload);
            // Length is the encoded byte count, not the character count.
            req.ContentLength = bytes.Length;

            using (Stream requestStream = req.GetRequestStream())
            {
                requestStream.Write(bytes, 0, bytes.Length);
            }

            using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
            {
                // Compare the numeric status rather than the server-supplied reason phrase.
                if (res.StatusCode != HttpStatusCode.OK)
                {
                    Console.WriteLine("无法连接!");
                }

                using (Stream dataStream = res.GetResponseStream())
                using (StreamReader reader = new StreamReader(dataStream, ResolveEncoding(res.CharacterSet)))
                {
                    return reader.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Maps the charset announced by the server to an <see cref="Encoding"/>,
        /// falling back to UTF-8 when it is missing or unknown.
        /// </summary>
        private static Encoding ResolveEncoding(string charset)
        {
            if (string.IsNullOrEmpty(charset))
            {
                return Encoding.UTF8;
            }

            try
            {
                return Encoding.GetEncoding(charset.Trim(' ', '"', '\''));
            }
            catch (ArgumentException)
            {
                return Encoding.UTF8;
            }
        }

        /// <summary>
        /// Percent-encodes a form value; null is treated as empty.
        /// </summary>
        private static string UrlEncode(string value)
        {
            if (string.IsNullOrEmpty(value))
            {
                return "";
            }

            if (value.Length <= EscapeChunkLength)
            {
                return Uri.EscapeDataString(value);
            }

            StringBuilder escaped = new StringBuilder(value.Length * 3);
            int position = 0;
            while (position < value.Length)
            {
                int length = Math.Min(EscapeChunkLength, value.Length - position);
                // Never cut between the two halves of a surrogate pair.
                if (position + length < value.Length && char.IsHighSurrogate(value[position + length - 1]))
                {
                    length--;
                }

                escaped.Append(Uri.EscapeDataString(value.Substring(position, length)));
                position += length;
            }

            return escaped.ToString();
        }

        /// <summary>
        /// Returns the text between the first <paramref name="startMarker"/> and the next
        /// <paramref name="endMarker"/> after it.
        /// </summary>
        /// <exception cref="InvalidDataException">Either marker is missing.</exception>
        private static string ExtractBetween(string content, string startMarker, string endMarker)
        {
            int markerAt = content.IndexOf(startMarker, StringComparison.Ordinal);
            if (markerAt < 0)
            {
                throw new InvalidDataException("Start marker not found in response: " + startMarker);
            }

            int bodyStart = markerAt + startMarker.Length;
            int bodyEnd = content.IndexOf(endMarker, bodyStart, StringComparison.Ordinal);
            if (bodyEnd < 0)
            {
                throw new InvalidDataException("End marker not found in response: " + endMarker);
            }

            return content.Substring(bodyStart, bodyEnd - bodyStart);
        }

        /// <summary>
        /// Turns the HTML fragment returned by Google into plain text.
        /// </summary>
        private static string DecodeGoogleHtml(string fragment)
        {
            // Real <br> tags go first: once "&lt;br&gt;" has been decoded it must stay
            // literal text. "&amp;" goes last so "&amp;lt;" ends up as "&lt;", not "<".
            return fragment
                .Replace("<br>", "\r\n")
                .Replace("&nbsp;", " ")
                .Replace("&#160;", " ")
                .Replace("&quot;", "\"")
                .Replace("&#39;", "'")
                .Replace("&gt;", ">")
                .Replace("&lt;", "<")
                .Replace("&amp;", "&");
        }
    }
    加载翻译方向
       struct LanguageType
            
    {
                
    public string value;
                
    public string text;
                
    public LanguageType(string v, string t)
                
    {
                    value 
    = v;
                    text 
    = t;
                }


                
    public override string ToString()
                
    {
                    
    return text;
                }

            }


      private void LoadLanguage(ComboBox comboBox)
            
    {
                comboBox.Items.Add(
    new LanguageType("ar|en""阿拉伯文到英语"));
                comboBox.Items.Add(
    new LanguageType("ko|en""朝鲜语到英语"));
                comboBox.Items.Add(
    new LanguageType("de|fr""德语到法语"));
                comboBox.Items.Add(
    new LanguageType("de|en""德语到英语"));
                comboBox.Items.Add(
    new LanguageType("ru|en""俄语到英语"));
                comboBox.Items.Add(
    new LanguageType("fr|de""法语到德语"));
                comboBox.Items.Add(
    new LanguageType("fr|en""法语到英语"));
                comboBox.Items.Add(
    new LanguageType("nl|en""荷兰语到英语"));
                comboBox.Items.Add(
    new LanguageType("pt|en""葡萄牙语到英语"));
                comboBox.Items.Add(
    new LanguageType("ja|en""日语到英语"));
                comboBox.Items.Add(
    new LanguageType("es|en""西班牙语到英语"));
                comboBox.Items.Add(
    new LanguageType("el|en""希腊语到英语"));
                comboBox.Items.Add(
    new LanguageType("it|en""意大利语到英语"));
                comboBox.Items.Add(
    new LanguageType("en|ar""英语到阿拉伯文"));
                comboBox.Items.Add(
    new LanguageType("en|ko""英语到朝鲜语"));
                comboBox.Items.Add(
    new LanguageType("en|de""英语到德语"));
                comboBox.Items.Add(
    new LanguageType("en|ru""英语到俄语"));
                comboBox.Items.Add(
    new LanguageType("en|fr""英语到法语"));
                comboBox.Items.Add(
    new LanguageType("en|nl""英语到荷兰语"));
                comboBox.Items.Add(
    new LanguageType("en|pt""英语到葡萄牙语"));
                comboBox.Items.Add(
    new LanguageType("en|ja""英语到日语"));
                comboBox.Items.Add(
    new LanguageType("en|es""英语到西班牙语"));
                comboBox.Items.Add(
    new LanguageType("en|el""英语到希腊语"));
                comboBox.Items.Add(
    new LanguageType("en|it""英语到意大利语"));
                comboBox.Items.Add(
    new LanguageType("en|zh-TW""英语到中文(繁体)"));
                comboBox.Items.Add(
    new LanguageType("en|zh-CN""英语到中文(简体)"));
                comboBox.Items.Add(
    new LanguageType("en|zh-CN""英语到中文"));
                comboBox.Items.Add(
    new LanguageType("zh|en""中文到英语"));
                comboBox.Items.Add(
    new LanguageType("zh-TW|zh-CN""中文(繁体到简体)"));
                comboBox.Items.Add(
    new LanguageType("zh-CN|zh-TW""中文(简体到繁体)"));
            }


    调用方法
      void GoogleT(TsMachine tm, string content, string languetype)
            
    {
                
    string tranlate = tm.Google(content, languetype);
                Google.Text 
    = tranlate;
            }


            
    /// <summary>
    /// Translates <paramref name="content"/> with Yahoo and shows the result in the Yahoo control.
    /// </summary>
    /// <param name="tm">Translator used for the request.</param>
    /// <param name="content">Text to translate.</param>
    /// <param name="languetype">Language pair in "from|to" form; rewritten before the call.</param>
    void YahooT(TsMachine tm, string content, string languetype)
    {
        // Rewrite the pair for Yahoo: '_' as separator, "zt"/"zh" for the Chinese variants.
        string yahooPair = languetype
            .Replace("|", "_")
            .Replace("zh-TW", "zt")
            .Replace("zh-CN", "zh");

        Yahoo.Text = tm.Yahoo(content, yahooPair);
    }



    在这个请求翻译的过程中,最麻烦的是编码问题。Yahoo使用的是固定编码格式(UTF-8),而Google就很不老实了,编码格式是变化的:每次调用 StreamReader reader = new StreamReader(dataStream, Encoding.GetEncoding("UTF-8")); 进行解码的时候老是出错,后来才发现每次编码格式都在改变,如果用固定的UTF-8解码,得到的都是乱码。
    根据多次测试,Google的编码格式和res.CharacterSet一致(不敢保证,但是测试了很多语种都是正确的)。虽然摆平了Google和Yahoo,但是谷词一直没能搞定。

    这是我编写的谷词翻译代码:

       public string Godict_T(string texts, string languages)
            
    {
                
    string payload = "from_content=" + texts + "&langpair=" + languages + "";
                WebRequest req 
    = HttpWebRequest.Create(@"http://trans.godict.com/index.php");
                req.Credentials 
    = CredentialCache.DefaultNetworkCredentials;
                req.Method 
    = "POST";
                req.ContentType 
    = "application/x-www-form-urlencoded";
                req.Timeout 
    = 50000;
                req.ContentLength 
    = payload.Length;

                Encoding encoding 
    = Encoding.GetEncoding("UTF-8");
                Byte[] bytes 
    = encoding.GetBytes(payload);
                req.ContentLength 
    = bytes.Length;
                Stream newStream 
    = null;

                newStream 
    = req.GetRequestStream();

                newStream.Write(bytes, 
    0, bytes.Length);
                newStream.Close();
                HttpWebResponse res 
    = (HttpWebResponse)req.GetResponse();
                
    if (res.StatusDescription.ToLower() != "ok")
                
    {
                    Console.WriteLine(
    "无法连接!");
                }

                Stream dataStream 
    = res.GetResponseStream();


                StreamReader reader 
    = new StreamReader(dataStream, Encoding.GetEncoding(res.CharacterSet));

                
    string responseFromServer = null;
                responseFromServer 
    = reader.ReadToEnd();


                reader.Close();
                dataStream.Close();
                res.Close();

                
    string content = responseFromServer;
                
    string s = "<textarea name=q rows=12 style=\" 740px;\" wrap=PHYSICAL dir=ltr>";
                
    int start = content.IndexOf(s);
                
    int end = content.IndexOf("</textarea>", start);
                
    return content.Substring(start + s.Length, end - start - s.Length);
            }

    结果发现这个谷词的res.CharacterSet(响应流)永远是iso-8859-1,这就郁闷了。不知道这个地方如何解码,有知道的朋友请指教!

  • 相关阅读:
    【转贴】Cookie + Session + OAuth + SSO
    zz淘宝商品库MySQL优化实践
    HIVE 数据倾斜调优总结zz
    数据挖掘笔记(一)
    hive函数参考手册
    hive QL(HQL)简明指南zz
    数据挖掘笔记(二)
    python format string (转)
    hive 中转义符使用问题
    关于文档管理
  • 原文地址:https://www.cnblogs.com/moses/p/1148735.html
Copyright © 2011-2022 走看看