zoukankan      html  css  js  c++  java
  • 【C#爬虫】抓取XX网站mp4资源地址

    抓取小视频的url地址,然后将地址信息拷贝到迅雷里批量下载就ok了

    主程序 代码

                //yazhouqingseAV 35
                //zhifusiwaAV 29
                //zipaishipin 30
                //oumeiqingseAV 28
                //katongdongman 31 
                //tongxingAV 32
                //sanjidianying 33
                //fengkuangqunjiao 34
    
                // Crawl configuration: "type" is the category slug used in the listing URLs and
                // "classid" is the numeric id sent to the play page for that category.
                var client = new WinHttpHelper();
                var type = "fengkuangqunjiao";
                var classid = 34;

                // Pattern for the detail-page links of this category; the capture is the item id.
                string itemPattern = "" + type + "/(.*?).html";

                // "i > 0" also ends the loop if the counter ever wraps around.
                for (int i = 1; i > 0; i++)
                {
                    Console.WriteLine(i);

                    // Page 1 is index.html; every later page is index_<n>.html.
                    string index = i == 1 ? "" : "_" + i;
                    string pageUrl = "http://www.lang34.com/se/" + type + "/index" + index + ".html";

                    var trs = RegexHelper.GetMathList(client.GET(pageUrl, Encoding.UTF8), itemPattern);

                    // Stop at the first listing page that yields no item links (or when the
                    // helper reports failure with null); otherwise the loop would keep
                    // requesting non-existent pages indefinitely.
                    if (trs == null || trs.Count == 0)
                    {
                        break;
                    }

                    foreach (var item in trs)
                    {
                        string temp;
                        if (!RegexHelper.GetMatchStr(item.ToString(), itemPattern, true, out temp))
                        {
                            continue;
                        }

                        string url = "http://www.lang34.com/e/DownSys/play/?classid=" + classid + "&id=" + temp + "&pathid=0";
                        string htmltext = client.GET(url, Encoding.UTF8);

                        string mp4;
                        if (!RegexHelper.GetMatchStr(htmltext, "f:'(.*?)',", true, out mp4))
                        {
                            continue;
                        }

                        // The title is optional: when it is missing the line is still written.
                        string titile;
                        RegexHelper.GetMatchStr(htmltext, " <title>(.*?)</title>", true, out titile);

                        // One record per line; the original literal was split across two
                        // source lines, which does not compile.
                        string output = mp4 + "?title" + titile + Environment.NewLine;
                        Console.WriteLine(output);
                        File.AppendAllText("D://" + type + ".txt", output);
                    }
                }

    网络请求类

    using System;
    using System.Collections.Generic;
    using System.Text;
    
    namespace MyHelper4Web
    {
        /// <summary>
        /// Small wrapper around the WinHttp COM <c>WinHttpRequest</c> object that issues
        /// GET and POST requests and returns the response body as bytes or as text.
        /// </summary>
        /// <remarks>
        /// Request failures are not thrown: the byte-returning methods catch every
        /// exception and return the exception message and source encoded with
        /// <see cref="Encoding.Default"/>, so callers cannot distinguish an error
        /// text from a real response body by type alone.
        /// </remarks>
        public class WinHttpHelper
        {
            private readonly WinHttp.WinHttpRequest request;

            /// <summary>Value sent in the Accept header.</summary>
            public string Accept = "*/*";
            /// <summary>Value sent in the User-Agent header.</summary>
            public string UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; InfoPath.2; .NET4.0E)";
            /// <summary>Value sent in the Content-Type header of POST requests.</summary>
            public string ContentType = "application/json";// "application/x-www-form-urlencoded";
            /// <summary>Timeout, in seconds, passed to <c>WaitForResponse</c>.</summary>
            public int SetTimeOut = 60;
            /// <summary>When false, automatic redirects are disabled on the request.</summary>
            public bool AllowAutoRedirect = true;
            /// <summary>When true, redirects from https to http are enabled on the request.</summary>
            public bool AllowHttpstoHttp = false;

            /// <summary>
            /// Creates a helper with the default header values.
            /// </summary>
            public WinHttpHelper()
            {
                request = new WinHttp.WinHttpRequest();
            }

            /// <summary>
            /// Creates a helper with custom request headers.
            /// </summary>
            /// <param name="Accept">Accept header value.</param>
            /// <param name="UserAgent">User-Agent header value.</param>
            /// <param name="ContentType">Content-Type header value used for POST.</param>
            public WinHttpHelper(string Accept, string UserAgent, string ContentType)
                : this() // the request object must be created here too, otherwise every call fails
            {
                this.Accept = Accept;
                this.UserAgent = UserAgent;
                this.ContentType = ContentType;
            }

            /// <summary>
            /// Creates a helper with custom request headers and timeout.
            /// </summary>
            /// <param name="Accept">Accept header value.</param>
            /// <param name="UserAgent">User-Agent header value.</param>
            /// <param name="ContentType">Content-Type header value used for POST.</param>
            /// <param name="SetTimeOut">Timeout in seconds.</param>
            public WinHttpHelper(string Accept, string UserAgent, string ContentType, int SetTimeOut)
                : this(Accept, UserAgent, ContentType)
            {
                this.SetTimeOut = SetTimeOut;
            }

            /// <summary>
            /// Requests a page with GET.
            /// </summary>
            /// <param name="Url">URL to request.</param>
            /// <param name="refer">Referer header value; omitted when null or empty.</param>
            /// <returns>The response body, or the encoded exception text on failure.</returns>
            public byte[] GET(string Url,string refer)
            {
                return Execute("GET", Url, "", refer, false);
            }

            /// <summary>
            /// Requests a page with GET and decodes the body.
            /// </summary>
            /// <param name="Url">URL to request.</param>
            /// <param name="Encode">Encoding used to decode the response body.</param>
            /// <returns>The decoded body, or an exception message if decoding fails.</returns>
            public string GET(string Url, Encoding Encode)
            {
                return Decode(GET(Url, ""), Encode);
            }

            /// <summary>
            /// Requests a page with GET, sending a Referer header, and decodes the body.
            /// </summary>
            /// <param name="Url">URL to request.</param>
            /// <param name="refer">Referer header value; omitted when null or empty.</param>
            /// <param name="Encode">Encoding used to decode the response body.</param>
            /// <returns>The decoded body.</returns>
            /// <remarks>Unlike the other string overloads, decoding errors are not caught here.</remarks>
            public string GET(string Url,string refer , Encoding Encode)
            {
                byte[] htmlbyte = GET(Url, refer);

                return  Encode.GetString(htmlbyte);
            }

            /// <summary>
            /// Sends a POST request.
            /// </summary>
            /// <param name="Url">URL to request.</param>
            /// <param name="PostData">Request body.</param>
            /// <param name="Refer">Referer header value; omitted when null or empty.</param>
            /// <returns>The response body, or the encoded exception text on failure.</returns>
            public byte[] POST(string Url, string PostData, string Refer)
            {
                return Execute("POST", Url, PostData, Refer, true);
            }

            /// <summary>
            /// Sends a POST request without a Referer header.
            /// </summary>
            /// <param name="Url">URL to request.</param>
            /// <param name="PostData">Request body.</param>
            /// <returns>The response body, or the encoded exception text on failure.</returns>
            public byte[] POST(string Url, string PostData)
            {
                return POST(Url, PostData, "");
            }

            /// <summary>
            /// Sends a POST request and decodes the body.
            /// </summary>
            /// <param name="Url">URL to request.</param>
            /// <param name="PostData">Request body.</param>
            /// <param name="Refer">Referer header value; omitted when null or empty.</param>
            /// <param name="Encode">Encoding used to decode the response body.</param>
            /// <returns>The decoded body, or an exception message if decoding fails.</returns>
            public string POST(string Url, string PostData, string Refer, Encoding Encode)
            {
                return Decode(POST(Url, PostData, Refer), Encode);
            }

            /// <summary>
            /// Sends a POST request without a Referer header and decodes the body.
            /// </summary>
            /// <param name="Url">URL to request.</param>
            /// <param name="PostData">Request body.</param>
            /// <param name="Encode">Encoding used to decode the response body.</param>
            /// <returns>The decoded body, or an exception message if decoding fails.</returns>
            public string POST(string Url, string PostData, Encoding Encode)
            {
                return Decode(POST(Url, PostData, ""), Encode);
            }

            /// <summary>
            /// Returns all response headers of the last request as one string.
            /// </summary>
            /// <remarks>
            /// Despite the name, the value is everything <c>GetAllResponseHeaders</c>
            /// returns, not only the cookie headers.
            /// </remarks>
            /// <returns>The raw header text, or an empty string if it cannot be read.</returns>
            public string GetAllCookis()
            {
                try
                {
                    return request.GetAllResponseHeaders();
                }
                catch (Exception)
                {
                    // Best effort: headers are unavailable, e.g. before any request was sent.
                    return "";
                }
            }

            /// <summary>
            /// Shared request pipeline for GET and POST: applies options, sends the
            /// request, waits for the response and returns its body.
            /// </summary>
            private byte[] Execute(string method, string url, string body, string referer, bool sendContentType)
            {
                try
                {
                    // NOTE(review): options are only written when they differ from the
                    // initial field values, so flipping a flag back after a request does
                    // not reset the option on the shared request object - confirm whether
                    // callers rely on toggling.
                    if (AllowAutoRedirect == false)
                    {
                        request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableRedirects, false);
                    }
                    if (AllowHttpstoHttp == true)
                    {
                        request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableHttpsToHttpRedirects, true);
                    }

                    request.Open(method, url, true);
                    request.SetRequestHeader("Accept", Accept);
                    request.SetRequestHeader("User-Agent", UserAgent);
                    if (sendContentType)
                    {
                        request.SetRequestHeader("Content-Type", ContentType);
                    }
                    if (!string.IsNullOrEmpty(referer))
                    {
                        request.SetRequestHeader("Referer", referer);
                    }

                    request.Send(body);
                    request.WaitForResponse(SetTimeOut);

                    // A missing body becomes an empty array so the string overloads
                    // decode it to "" instead of reporting a null-argument error.
                    return (byte[])request.ResponseBody ?? new byte[0];
                }
                catch (Exception ex)
                {
                    // Existing contract: failures are reported through the returned bytes.
                    return Encoding.Default.GetBytes(ex.Message + ex.Source);
                }
            }

            /// <summary>
            /// Decodes a response body, returning the exception text if decoding fails.
            /// </summary>
            private static string Decode(byte[] body, Encoding encode)
            {
                try
                {
                    return encode.GetString(body);
                }
                catch (Exception ex)
                {
                    return ex.Message + ex.Source;
                }
            }
        }
    }

    正则表达式类

    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Collections;
    
    namespace MyHelper4Web
    {
        /// <summary>
        /// Convenience helpers for extracting text with regular expressions.
        /// All patterns are evaluated with <see cref="RegexOptions.Singleline"/> and
        /// <see cref="RegexOptions.IgnoreCase"/>.
        /// </summary>
        /// <remarks>
        /// A pattern may contain the token <c>(.*?)</c>; its first occurrence marks the
        /// part of the match that is returned as the "cut" result. Patterns without
        /// the token yield the whole match.
        /// </remarks>
        public class RegexHelper
        {
            private const string CaptureToken = "(.*?)";
            private const string CaptureGroupName = "RegexHelperCut";
            private const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

            /// <summary>
            /// Finds the first match of <paramref name="pattern"/> and returns either the
            /// captured part or the whole match.
            /// </summary>
            /// <param name="htmltext">Text to search.</param>
            /// <param name="pattern">Regular expression, optionally containing <c>(.*?)</c>.</param>
            /// <param name="isCut">True to return only the captured part; false to return the whole match.</param>
            /// <param name="result">The extracted text, or an empty string when nothing matched.</param>
            /// <returns>True when the pattern matched and the captured part is not empty.</returns>
            public static bool GetMatchStr(string htmltext, string pattern, bool isCut, out string result)
            {
                string whole;
                string captured;
                TryMatch(htmltext, pattern, out whole, out captured);

                result = isCut ? captured : whole;
                return !string.IsNullOrEmpty(captured);
            }

            /// <summary>
            /// Finds the first match of <paramref name="pattern"/> and returns it as a string.
            /// </summary>
            /// <param name="htmltext">Text to search.</param>
            /// <param name="pattern">Regular expression, optionally containing <c>(.*?)</c>.</param>
            /// <param name="isCut">True to return only the captured part; false to return the whole match.</param>
            /// <returns>The extracted text, or an empty string when nothing matched.</returns>
            public static string GetMatchString(string htmltext, string pattern, bool isCut)
            {
                string whole;
                string captured;
                TryMatch(htmltext, pattern, out whole, out captured);

                return isCut ? captured : whole;
            }

            /// <summary>
            /// Finds the first match of <paramref name="pattern"/> and returns its captured part.
            /// </summary>
            /// <param name="htmltext">Text to search.</param>
            /// <param name="pattern">Regular expression, optionally containing <c>(.*?)</c>.</param>
            /// <param name="result">The captured text, or an empty string when nothing matched.</param>
            /// <returns>True when the pattern matched and the captured part is not empty.</returns>
            public static bool GetMatchStr(string htmltext, string pattern, out string result)
            {
                string whole;
                TryMatch(htmltext, pattern, out whole, out result);

                return !string.IsNullOrEmpty(result);
            }

            /// <summary>
            /// Returns the text of every match of <paramref name="pattern"/>.
            /// </summary>
            /// <param name="htmltext">Text to search.</param>
            /// <param name="pattern">Regular expression.</param>
            /// <returns>
            /// A list of whole-match strings in order of appearance; empty when nothing
            /// matched, an argument is null, or the pattern is invalid.
            /// </returns>
            public static ArrayList GetMathList(string htmltext, string pattern)
            {
                ArrayList list = new ArrayList();
                if (htmltext == null || pattern == null)
                {
                    return list;
                }

                try
                {
                    foreach (Match match in new Regex(pattern, Options).Matches(htmltext))
                    {
                        list.Add(match.Value);
                    }
                }
                catch (ArgumentException)
                {
                    // Invalid pattern: report "no matches" so callers can enumerate safely.
                    list.Clear();
                }
                return list;
            }

            /// <summary>
            /// Runs the pattern once and reports both the whole match and the text
            /// captured by the first <c>(.*?)</c> token.
            /// </summary>
            /// <remarks>
            /// The token is rewritten into a named group so the capture is read from
            /// the regex engine instead of stripping the pattern text from the match,
            /// which removed content inside the capture and failed for case-insensitive
            /// matches or prefixes containing regex metacharacters.
            /// </remarks>
            private static bool TryMatch(string htmltext, string pattern, out string whole, out string captured)
            {
                whole = "";
                captured = "";
                if (htmltext == null || pattern == null)
                {
                    return false;
                }

                try
                {
                    int tokenAt = pattern.IndexOf(CaptureToken, StringComparison.Ordinal);
                    string effective = pattern;
                    if (tokenAt >= 0)
                    {
                        effective = pattern.Substring(0, tokenAt)
                            + "(?<" + CaptureGroupName + ">.*?)"
                            + pattern.Substring(tokenAt + CaptureToken.Length);
                    }

                    Match match = new Regex(effective, Options).Match(htmltext);
                    if (!match.Success)
                    {
                        return false;
                    }

                    whole = match.Value;
                    // Without the token there is nothing to cut: the capture is the whole match.
                    captured = tokenAt >= 0 ? match.Groups[CaptureGroupName].Value : whole;
                    return true;
                }
                catch (ArgumentException)
                {
                    // Invalid pattern: treated as "no match", as before.
                    return false;
                }
            }
        }
    }
  • 相关阅读:
    Codeforces Round #411 (Div. 2)
    腾讯比赛资料
    AtCoder Beginner Contest 060
    hdu 5288 数学 ****
    hdu 1866 几个矩形面积的和 ***
    hdu 2232 矩阵 ***
    bzoj 1415 期望+记忆化搜索 ****
    hdu 5033 单调栈 ****
    hdu 3032 sg打表找规律 *
    hdu 2516 FIB博弈
  • 原文地址:https://www.cnblogs.com/jhli/p/5915242.html
Copyright © 2011-2022 走看看