zoukankan      html  css  js  c++  java
  • 用Erlang写一个抓写真图片的程序

    某论坛有写真图片,时不时会更新,写一个Erlang抓图片的程序,一来有需求,二来练习一下Erlang编程。

    好的,那么开始。

    首先,程序需要分成几个模块,一个处理网络请求的模块httpDownload,一个处理网络返回数据的模块dealPageData,控制流程的模块downImg。由于网络请求需要控制并发数量,还需要构建一个信号量模块semaphore。

    首先是semaphore.erl的代码

    -module(semaphore).
    -export([start/1, stop/0]).
    -export([wait/0, signal/0]).
    
    %% @doc Start the counting semaphore with `ResCount' free permits.
    %% The semaphore lives in its own process, registered under the name
    %% `mutex'. Returns the result of register/2.
    start(ResCount) ->
        Semaphore = spawn(fun() -> init(ResCount) end),
        register(mutex, Semaphore).
        
    %% @doc Ask the semaphore process to shut down by sending it `stop'.
    %% Returns the message that was sent.
    stop() ->
        erlang:send(mutex, stop).
        
    %% @doc Acquire one permit. Sends `{wait, Pid}' to the semaphore and
    %% blocks the calling process until the semaphore answers `ok'.
    wait() ->
        Caller = self(),
        mutex ! {wait, Caller},
        receive
            ok -> ok
        end.
        
        
    %% @doc Release one permit. Fire-and-forget: the semaphore does not reply,
    %% so this returns `ok' immediately after sending.
    signal() ->
        Releaser = self(),
        mutex ! {signal, Releaser},
        ok.
        
    %% @doc Entry point of the semaphore process: enter the `free' state with
    %% `InitialValue' permits available.
    init(InitialValue) ->
        Permits = InitialValue,
        free(Permits).
        
    %% @doc Semaphore state while permits may be available.
    %% With zero permits the process logs and switches to busy/0. Otherwise
    %% it serves one message: a `wait' hands out a permit, a `signal' returns
    %% one, and `stop' starts shutdown.
    free(0) ->
        io:format("wait, busy~n", []),
        busy();
    free(Permits) ->
        receive
            stop ->
                terminate();
            {signal, _Releaser} ->
                io:format("signal, ok~n", []),
                free(Permits + 1);
            {wait, Waiter} ->
                %% Grant the permit before logging, then count it as taken.
                Waiter ! ok,
                io:format("wait, ok~n", []),
                free(Permits - 1)
        end.
        
        
    %% @doc Semaphore state while no permit is available.
    %% `wait' requests are deliberately not matched here, so they stay queued
    %% in the mailbox until a permit is returned and free/1 serves them.
    %% `stop' is handled in this state too; otherwise an exhausted semaphore
    %% could only be stopped after some holder happened to signal.
    busy() ->
        receive
            {signal, _} ->
                io:format("signal, ok~n", []),
                free(1);
            stop ->
                terminate()
        end.
        
        
    %% @doc Shutdown step: kill every process whose `wait' request is still
    %% queued in the mailbox, then return `ok' once no such request is left.
    terminate() ->
        receive
            {wait, Waiter} ->
                exit(Waiter, kill),
                terminate()
        after 0 ->
            %% Zero timeout: only drain what is already queued, never block.
            ok
        end.
        

    然后在semaphore的基础上可以构建httpDownload.erl

    -module(httpDownload).
    -export([request/2, init/1, stop/0]).
    
    %% @doc Bring up the downloader.
    %% Starts the semaphore with `DownloadThreadCount' permits, starts inets,
    %% then spawns and registers the three helper processes: the URL
    %% de-duplication set (`requestable'), the queue counter (`queueCount')
    %% and the dispatcher (`httpDownload'). Returns `ok'.
    init(DownloadThreadCount) ->
        semaphore:start(DownloadThreadCount),
        inets:start(),
        Helpers = [
            {requestable, fun() -> theMapLoop(sets:new()) end},
            {queueCount, fun() -> requestQueueCount(0) end},
            {httpDownload, fun() -> loop() end}
        ],
        lists:foreach(
            fun({Name, Body}) -> register(Name, spawn(Body)) end,
            Helpers
        ),
        ok.
            
    
    %% @doc Tell the dispatcher process to leave its loop by sending `exit'.
    %% Only the `httpDownload' process is contacted here. Returns the
    %% message that was sent.
    stop() ->
        erlang:send(httpDownload, exit).
    
    
    %% @doc Counter process for queued requests.
    %% `add' increments and `remove' decrements the count. If neither
    %% arrives within one second the count is left unchanged. The current
    %% value is printed after every message and after every timeout.
    requestQueueCount(Count) ->
        Delta =
            receive
                add -> 1;
                remove -> -1
            after 1000 ->
                0
            end,
        NewCount = Count + Delta,
        io:format("--------------   ~p   --------------~n", [NewCount]),
        requestQueueCount(NewCount).
    
    
    %% @doc Worker body: fetch `Url' and report the outcome to `From'.
    %% On success sends `{ok, {State, {Url, Body}}}'; on any other outcome
    %% sends `{error, Message}'. Always returns `ok'.
    %%
    %% Only 2xx responses count as success, so the body of an HTTP error
    %% page is not passed on as if it were the requested page or image.
    %% The permit is released in `after', so it is returned to the semaphore
    %% even if this worker crashes; a leaked permit would otherwise lower
    %% the concurrency limit for good and could stall the dispatcher.
    requestThread(From, Url, State) ->
        try
            case httpc:request(Url) of
                {ok, {{_Version, Status, _Reason}, _Headers, Data}}
                        when Status >= 200, Status < 300 ->
                    io:format("ok request: ~p~n", [Url]),
                    From ! {ok, {State, {Url, Data}}};
                _ ->
                    From ! {error, Url ++ " request failed"}
            end
        after
            semaphore:signal()
        end,
        ok.
    
    
    %% @doc Process loop that remembers every element it has been asked about.
    %% For `{From, El}' it replies `true' the first time `El' is seen (and
    %% records it) and `false' on every later request. Any other message is
    %% discarded.
    theMapLoop(Seen) ->
        receive
            {From, El} ->
                IsNew = not sets:is_element(El, Seen),
                From ! IsNew,
                NextSeen =
                    case IsNew of
                        true -> sets:add_element(El, Seen);
                        false -> Seen
                    end,
                theMapLoop(NextSeen);
            _ ->
                theMapLoop(Seen)
        end.
    
    
    %% @doc Dispatcher loop.
    %% For each `{From, Url, State}' job it first acquires a semaphore permit
    %% (blocking while all permits are taken), decrements the queue counter
    %% and spawns a worker for the request. The atom `exit' ends the loop
    %% with the result `stop'.
    loop() ->
        receive
            exit ->
                stop;
            {From, Url, State} ->
                semaphore:wait(),
                queueCount ! remove,
                Worker = fun() -> requestThread(From, Url, State) end,
                spawn(Worker),
                loop()
        end.
    
    
    
    
    %% @doc Queue `Url' for download unless it was already requested.
    %% Asks the `requestable' process whether the URL is new. A new URL bumps
    %% the queue counter and is handed to the dispatcher together with
    %% `State'; a known URL is only logged. The result messages are sent to
    %% the calling process. Always returns `ok'.
    request(Url, State) ->
        Caller = self(),
        requestable ! {Caller, Url},
        IsNew =
            receive
                Answer when is_boolean(Answer) -> Answer
            end,
        case IsNew of
            true ->
                queueCount ! add,
                httpDownload ! {Caller, Url, State};
            false ->
                io:format("request already exists: ~p~n", [Url])
        end,
        ok.
        
        

    接下来是处理网络请求数据的dealPageData.erl

    -module(dealPageData).
    -export([deal/2, fileExists/1]).
    
    
    %% @doc Check whether the image for `Url' is already on disk.
    %% The local file name is the URL with every ":" and "/" replaced by
    %% "_", placed under the image save directory. Returns `true' when that
    %% file exists, otherwise `{false, FullPath}' so the caller can write it.
    fileExists(Url) ->
        SaveDir = imgSavePath(),
        SafeName = binary:replace(
            binary:list_to_bin(Url),
            [<<":">>, <<"/">>],
            <<"_">>,
            [global]
        ),
        FileFullPath = <<SaveDir/binary, SafeName/binary>>,
        case file:read_file_info(FileFullPath) of
            {ok, _Info} -> true;
            _ -> {false, FileFullPath}
        end.
    
        
    %% @doc Directory prefix (with trailing slash) under which images are saved.
    imgSavePath() ->
        <<"d:/images/">>.
        
    
    %% @doc Turn re:run/3 index captures into substrings of `Data'.
    %% Each element of the capture list is one match; its last
    %% `{Start, Len}' pair (zero-based start) is cut out of `Data' and
    %% prepended to `Result', so the substrings come out in reverse match
    %% order.
    captureData(Data, Captures, Result) ->
        lists:foldl(
            fun(Capture, Acc) ->
                {Start, Len} = lists:last(Capture),
                [string:substr(Data, Start + 1, Len) | Acc]
            end,
            Result,
            Captures
        ).
        
    %% @doc Process one downloaded resource according to its kind.
    %%
    %% `main': returns `{main, Links}' with the thread-page links found in
    %%         the forum index, or `{error, Msg}' when none is found.
    %% `page': returns `{page, Urls}' with the ".jpg" image URLs found in
    %%         `src' attributes, or `{error, Msg}' when none is found.
    %% `img':  writes the image body to disk and returns `{img, Msg}', or
    %%         `{error, Msg}' when the file already exists or cannot be
    %%         written.
    deal(main, {_, Data}) ->
        %% The dot is escaped so that only a literal ".html" suffix matches.
        extractLinks(main, Data, "thread-[0-9]+-1-1\\.html",
                     "nomatch page link at main page");
    deal(page, {_, Data}) ->
        extractLinks(page, Data, "src=\"([^\"]+\\.jpg)\"",
                     "nomatch image link at page");
    deal(img, {Url, Data}) ->
        case fileExists(Url) of
            true -> {error, "file already exists"};
            {false, FilePath} -> saveImage(Url, FilePath, Data)
        end.

    %% Run a constant pattern over Data and tag the captured substrings.
    %% The patterns are literals, so the separate compile step and its error
    %% branch are not needed.
    extractLinks(Tag, Data, Pattern, NoMatchMsg) ->
        case re:run(Data, Pattern, [global]) of
            {match, Captured} -> {Tag, captureData(Data, Captured, [])};
            nomatch -> {error, NoMatchMsg}
        end.

    %% Write the image body to FilePath, creating the directory if needed.
    saveImage(Url, FilePath, Data) ->
        Written =
            case filelib:ensure_dir(FilePath) of
                ok -> file:write_file(FilePath, Data);
                {error, _} = DirError -> DirError
            end,
        case Written of
            ok -> {img, Url ++ " saved"};
            {error, _} -> {error, "error when save file"}
        end.
    
        

    最后是将这些功能模块串起来的downImg.erl

    -module(downImg).
    -export([start/0]).
    
    
    
    %% @doc Base URL that relative forum links are appended to.
    httpRoot() ->
        "http://f1.avzcf.info/bbs/".
        
    %% @doc Path of the forum index page, relative to httpRoot/0.
    mainPage() ->
        "forum-13-1.html".
    
    
    %% @doc Main receive loop of the crawler; never returns.
    %% Download results are handed to dealPageData:deal/2 and the outcome is
    %% acted on by handleResult/1. Download errors are printed.
    loop() ->
        receive
            {error, Reason} ->
                io:format("~p~n", [Reason]);
            {ok, {State, Data}} ->
                handleResult(dealPageData:deal(State, Data))
        end,
        loop().

    %% Index page: queue every thread page, resolved against the site root.
    handleResult({main, StrList}) ->
        Root = httpRoot(),
        lists:foreach(
            fun(Str) -> httpDownload:request(Root ++ Str, page) end,
            StrList
        );
    %% Thread page: queue each image that is not on disk yet.
    handleResult({page, StrList}) ->
        lists:foreach(fun requestIfMissing/1, StrList);
    %% Saved image or processing error: just print the message.
    handleResult({img, Msg}) ->
        io:format("~p~n", [Msg]);
    handleResult({error, Msg}) ->
        io:format("~p~n", [Msg]).

    %% Request an image URL unless its file already exists locally.
    requestIfMissing(ImgUrl) ->
        case dealPageData:fileExists(ImgUrl) of
            true -> noDownload;
            {false, _} -> httpDownload:request(ImgUrl, img)
        end.
    
    
    %% @doc Run the crawler: start the downloader with 3 permits, queue the
    %% forum index page and enter the result loop in the calling process.
    %% Does not return.
    start() ->
        httpDownload:init(3),
        EntryUrl = httpRoot() ++ mainPage(),
        httpDownload:request(EntryUrl, main),
        loop().

    代码贴完。start函数会启动抓取任务,并把同时进行的网络请求数量限制为最多3个。

  • 相关阅读:
    mysql那些事(1)手机号与座机号码如何存储
    分享一个PHP调用RestFul接口的函数
    php sprintf用法
    HTTP状态码详解
    PHP随机生成中国人姓名的类
    PHP计算两组经纬度坐标之间的距离
    PHP根据经纬度获取在范围坐标的数据
    PHP 利用QQ邮箱发送邮件「PHPMailer」
    PHP中利用PHPMailer配合QQ邮箱实现发邮件
    修改PHP上传文件大小限制
  • 原文地址:https://www.cnblogs.com/gibbon/p/2550037.html
Copyright © 2011-2022 走看看