zoukankan      html  css  js  c++  java
  • 火车头伪原创,火车头采集器php插件

    火车头伪原创之所以使用多个NLP项目,是因为百度AI本来就可以完成整个处理流程。但是火车头采集器php插件所用的自然语言处理API,对于普通用户来说,调用次数是有限制的,超出部分是要收费的,所以数据量大的处理会交给HanLP项目,只有数据量较小的分词才交给百度AI处理。

    至于同义词库文件的位置,不建议移动或更改。文件夹和名称都一样。有时间跟进,再优化这个问题。

    火车头采集器php插件源码:

    <?php
    
    
    // Raise the script execution time limit to 270 seconds.
    set_time_limit(270);
    // Report only fatal errors, warnings and parse errors.
    error_reporting(E_ERROR | E_WARNING | E_PARSE);
    
    // NOTE(review): TITLE_SEPAR is not referenced anywhere in the visible code.
    define('TITLE_SEPAR', 'xxx**xxx');
    // Marker that compose_separator() wraps in parentheses and newlines to
    // separate the title from the content.
    define('TITLE_SEPAR2', '262661');
    
    
    // Endpoint of the rewriting API; read by get_wyc_article() via `global $url`.
    // NOTE(review): the API key is embedded in the URL in plain text.
    $url = 'http://api-2.xiaofamao.com/api.php?json=0&v=1&key=aa740455217';
    
    // Name of the label that holds the article content.
    $content_tag_name = '内容';
    // Name of the label that holds the lock words. The label value is a list such as
    // "关键词,小发猫,伪原创", separated by ASCII commas.
    $realtime_keyword = '锁词';
    // Name of the label that holds the article title.
    $title_tag_name = '标题';
    
    
    
    // Text prepended / appended to the rewritten content in the 'Save' branch.
    $headdd = '';
    $taill = '';
    
    
    
    // Dispatch on the page type found in $LabelArray['PageType'].
    // NOTE(review): $LabelArray is not defined in the visible code; it is presumably
    // supplied by the collector before this point - confirm.
    switch ($LabelArray['PageType']) {
        case 'List':    // list page: only raw HTML can be processed here
        case 'Pages':   // multi-page: only raw HTML can be processed here
        case 'Content': // default page: only raw HTML can be processed here
            break;

        case 'Save':    // label values can only be modified at save time
            try {
                // Prefix the title; get_wyc_article() strips this prefix again
                // from the title part of the response.
                $title = '标题:' . $LabelArray[$title_tag_name];
                $content = $LabelArray[$content_tag_name];

                // Swap the label name for the label value (per-article lock words).
                // A missing label yields null, which http_build_query() omits.
                $realtime_keyword = isset($LabelArray[$realtime_keyword])
                    ? $LabelArray[$realtime_keyword]
                    : null;

                $article_src = compose_article($title, $content);
                $article_new = get_wyc_article($article_src, $realtime_keyword);

                // get_wyc_article() splits the response on the title/content marker.
                // Fewer than two parts means the marker did not come back (for example
                // curl_request() returned an error message). Keep the collected article
                // untouched instead of replacing the content with an empty string.
                if (!is_array($article_new) || count($article_new) < 2) {
                    break;
                }

                $title_wyc = trim($article_new[0]);
                $content_wyc = fix_newline(trim($article_new[1]));
                // NOTE(review): search and replacement are identical as written, so this
                // call changes nothing; one side was presumably a different colon variant.
                $content_wyc = str_replace('标签:', '标签:', $content_wyc);
                $LabelArray[$content_tag_name] = $headdd . $content_wyc . $taill;

                // Replace punctuation in the rewritten title with spaces.
                $title_wyc = str_replace(array('。', ',', '%'), array(' ', ' ', ' '), $title_wyc);
                $LabelArray[$title_tag_name] = $title_wyc;
            } catch (Exception $e) {
                // Make the failure visible in both labels.
                $LabelArray[$title_tag_name] .= $e->getMessage();
                $LabelArray[$content_tag_name] .= $e->getMessage();
            }
            break;

        default:
            break;
    }

    // Emit the (possibly modified) label array in PHP serialize() format.
    echo serialize($LabelArray);
    
    
    
    /**
     * Join a title and a content body into one text, with the marker returned by
     * compose_separator() between them.
     *
     * @param string $title   Article title.
     * @param string $content Article body.
     * @return string Title, separator and content concatenated in that order.
     */
    function compose_article($title, $content) {
        return implode(compose_separator(), array($title, $content));
    }
    
    /**
     * Build the title/content separator: the TITLE_SEPAR2 marker in parentheses,
     * with one PHP_EOL before and one after.
     *
     * @return string
     */
    function compose_separator() {
        return sprintf('%s(%s)%s', PHP_EOL, TITLE_SEPAR2, PHP_EOL);
    }
    
    
    /**
     * Hook for repairing the separator in an API response.
     * Currently a pass-through: the article is returned unchanged.
     *
     * @param string $article Raw API response.
     * @return string The same value that was passed in.
     */
    function fix_separator($article)
    {
        $unchanged = $article;

        return $unchanged;
    }
    
    
    /**
     * Send an article to the rewriting API and split the response into a title
     * part and a content part.
     *
     * @param string      $str              Article text (title, separator, content).
     * @param string|null $realtime_keyword Lock words, posted as "keywords".
     * @return array Index 0 is the cleaned title part. Index 1 exists only when the
     *               separator marker was found in the response and holds everything
     *               after the first marker.
     */
    function get_wyc_article($str, $realtime_keyword) {
        global $url;

        // The response is split on the bare marker, without its surrounding newlines.
        $separator = str_replace(PHP_EOL, '', compose_separator());

        $wyc = curl_request($url, array('wenzhang' => $str, 'keywords' => $realtime_keyword));
        $wyc = fix_separator($wyc);

        // Limit 2: a marker that also occurs inside the content must not truncate it.
        $wyc = explode($separator, $wyc, 2);

        // Strip the title prefix (or its remains) from the title part. The original
        // repeated byte-identical calls; presumably one of each pair targeted the
        // full-width colon, so both colon variants are handled here.
        $wyc[0] = str_replace(array('标题:', '标题:', '目:', '目:'), '', $wyc[0]);

        // Remove a stray leading "题" by anchoring it to a sentinel prefix, then
        // drop the sentinel itself.
        $wyc[0] = 'xx`xx' . $wyc[0];
        $wyc[0] = str_replace('xx`xx题', '', $wyc[0]);
        $wyc[0] = str_replace('xx`xx', '', $wyc[0]);

        return $wyc;
    }
    
    
    /**
     * Rewrite a title on its own and return the first line of the result.
     *
     * The title is sent three times, separated by blank lines.
     *
     * @param string $str Title to rewrite.
     * @return string First line of the rewritten text ('' when nothing came back).
     */
    function get_wyc_title($str) {
        $gap = PHP_EOL . PHP_EOL . PHP_EOL;

        // get_wyc_article() needs both arguments and returns an array of parts;
        // no lock words are available here, so an empty keyword list is sent.
        $parts = get_wyc_article($str . $gap . $str . $gap . $str, '');
        $rewritten = isset($parts[0]) ? $parts[0] : '';

        // Trim first so that a leading blank line cannot become the "first line".
        $rewritten = fix_newline(trim($rewritten));
        $lines = explode(PHP_EOL, $rewritten);

        return $lines[0];
    }
    
    /**
     * Request keywords for an article from the keyword-extraction endpoint.
     *
     * @param string $title    Article title, posted as "title".
     * @param string $contents Article text, posted as "text".
     * @return mixed Whatever curl_request() returns for the call.
     */
    function get_keywords($title, $contents) {
        $payload = array(
            'title' => $title,
            'len'   => 100,
            'text'  => $contents,
        );

        return curl_request('http://api-2.78tp.com/nlp/kws.php?appid=', $payload);
    }
    
    
    /**
     * Remove every double-quoted alt="..." attribute from a piece of HTML.
     *
     * @param string $contents HTML fragment.
     * @return string The fragment without alt attributes; returned unchanged if the
     *                regex engine reports an error.
     */
    function remove_alt($contents) {
        // [^"]* stops at the attribute's own closing quote. A greedy .* would run on
        // to the last quote of the line and delete other attributes and tags with it.
        $stripped = preg_replace('/alt="[^"]*"/', '', $contents);

        return $stripped === null ? $contents : $stripped;
    }
    
    
    /**
     * Strip punctuation from a title.
     *
     * Removes CJK / full-width punctuation as well as the ASCII equivalents. The
     * original list contained the ASCII marks twice; the first row was presumably
     * the full-width set, which is restored here.
     *
     * @param string $contents Title text (UTF-8).
     * @return string Title without the listed punctuation marks.
     */
    function fix_title($contents) {
        $punctuation_symbol = array(
            // CJK and full-width forms
            '。', '?', ',', ':', ';', '、', '!',
            // ASCII forms
            '.', '?', ',', ':', ';', '!',
        );

        return str_replace($punctuation_symbol, '', $contents);
    }
    
    /**
     * Convert the listed spellings of the HTML line-break tag into PHP_EOL.
     *
     * Only the exact lower-case and upper-case spellings below are replaced.
     *
     * @param string $contents HTML fragment.
     * @return string Fragment with those tags replaced by PHP_EOL.
     */
    function br2newline($contents) {
        $br_variants = array('<br>', '<br/>', '<br />', '<BR/>', '<BR>', '<BR />');

        return str_replace($br_variants, PHP_EOL, $contents);
    }
    
    /**
     * Turn every PHP_EOL into "<br>", then remove a "<br>" that directly follows
     * an opening "<p>".
     *
     * @param string $contnets Text or HTML fragment.
     * @return string Fragment with line breaks expressed as <br> tags.
     */
    function newline2br($contnets) {
        // The pairs are applied in order, so the second sees the output of the first.
        return str_replace(
            array(PHP_EOL, '<p><br>'),
            array('<br>', '<p>'),
            $contnets
        );
    }
    
    
    /**
     * Normalise the line breaks of a text by delegating to fix_newline().
     *
     * @param string $contents Text to normalise.
     * @return string Result of fix_newline() for the given text.
     */
    function delete_newline($contents) {
        return fix_newline($contents);
    }
    
    /**
     * Normalise all line endings (CRLF, lone CR, LF) to PHP_EOL.
     *
     * The string literals had lost their escape sequences and contained a raw line
     * break plus indentation, so no real line ending was ever matched. The
     * CRLF / CR / LF sequence is restored here.
     *
     * @param string $contents Text with mixed line endings.
     * @return string Text whose line endings are all PHP_EOL.
     */
    function reset_newline_win($contents) {
        // Collapse to LF first (CRLF before lone CR, so a CRLF pair does not become
        // two breaks), then convert to the platform line ending.
        $contents = str_replace("\r\n", "\n", $contents);
        $contents = str_replace("\r", "\n", $contents);
        $contents = str_replace("\n", PHP_EOL, $contents);

        return $contents;
    }
    
    /**
     * Normalise line endings to PHP_EOL and collapse runs of blank lines into a
     * single line break.
     *
     * The string literals had lost their escape sequences and contained raw line
     * breaks plus indentation, so real line endings were never matched. The
     * escape sequences are restored here.
     *
     * @param string $data Text with arbitrary line endings.
     * @return string Text with single PHP_EOL line breaks only.
     */
    function fix_newline($data) {
        // Work on LF only: CRLF first, then any remaining lone CR.
        $data = str_replace(array("\r\n", "\r"), "\n", $data);

        // Repeat until no consecutive line breaks are left.
        while (strpos($data, "\n\n") !== false) {
            $data = str_replace("\n\n", "\n", $data);
        }

        return str_replace("\n", PHP_EOL, $data);
    }
    
    /**
     * Strip HTML attributes from a fragment with cleanHtml.
     *
     * "src" is kept on every element; img additionally keeps "alt" and iframe
     * additionally keeps "frameborder".
     *
     * @param string $contents HTML fragment.
     * @return string Fragment as returned by cleanHtml::strip().
     */
    function clean_contents($contents) {
        $cleaner = new cleanHtml();
        $cleaner->allow = array('src');
        $cleaner->exceptions = array(
            'img'    => array('src', 'alt'),
            'iframe' => array('src', 'frameborder'),
        );

        return $cleaner->strip($contents);
    }
    
    
    /**
     * Replace only the first occurrence of a substring.
     *
     * @param string $search  Text to look for.
     * @param string $replace Text to put in its place.
     * @param string $subject Text to search in.
     * @return string $subject with the first match replaced, or $subject unchanged
     *                when $search does not occur.
     */
    function xfm_strong_str_replace_once($search, $replace, $subject) {
        $offset = strpos($subject, $search);
        if ($offset === false) {
            return $subject;
        }

        // Splice the replacement over the matched byte range.
        return substr_replace($subject, $replace, $offset, strlen($search));
    }
    
    /**
     * Perform an HTTP request with cURL.
     *
     * @param string       $url          URL to request.
     * @param array|string $post         POST data; an array is encoded with
     *                                   http_build_query(), a string is sent as is.
     *                                   Empty means a GET request.
     * @param string       $cookie       Cookie header value to send, if any.
     * @param int          $returnCookie Non-zero to also return the first cookie set
     *                                   by the server.
     * @return string|array The response body. On a cURL failure the cURL error
     *                      message is returned instead. When $returnCookie is set,
     *                      array('cookie' => ..., 'content' => ...).
     */
    function curl_request($url, $post = '', $cookie = '', $returnCookie = 0) {
        if (!extension_loaded('curl')) {
            file_exists('./ext/php_curl.dll') && dl('php_curl.dll'); // load the extension
        }

        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
        // Redirects are followed only when open_basedir is empty and safe_mode is not "on".
        if (ini_get('open_basedir') == '' && strtolower(ini_get('safe_mode')) != 'on') {
            curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
        }
        curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
        curl_setopt($curl, CURLOPT_REFERER, "http://XXX");
        if ($post) {
            curl_setopt($curl, CURLOPT_POST, 1);
            // http_build_query() only accepts arrays/objects; pass strings through.
            curl_setopt($curl, CURLOPT_POSTFIELDS, is_array($post) ? http_build_query($post) : $post);
        }
        if ($cookie) {
            curl_setopt($curl, CURLOPT_COOKIE, $cookie);
        }
        curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
        curl_setopt($curl, CURLOPT_TIMEOUT, 150);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

        $data = curl_exec($curl);
        if (curl_errno($curl)) {
            // Callers receive the error text in place of the body; release the
            // handle before returning.
            $error = curl_error($curl);
            curl_close($curl);
            return $error;
        }
        // Total size of all header blocks, including those of followed redirects.
        $header_size = curl_getinfo($curl, CURLINFO_HEADER_SIZE);
        curl_close($curl);

        if (!$returnCookie) {
            return $data;
        }

        $header = (string) substr($data, 0, $header_size);
        $body = (string) substr($data, $header_size);

        // First cookie only; stays empty when the server sets none.
        $cookie_value = '';
        if (preg_match('/Set-Cookie:([^;]*);/i', $header, $matches)) {
            $cookie_value = ltrim($matches[1]);
        }

        return array(
            'cookie'  => $cookie_value,
            'content' => $body,
        );
    }
    
    //echo $tag;
    /**
     * Count the characters (not bytes) of a UTF-8 string, e.g. Chinese text.
     *
     * @param string|null $string Text to measure.
     * @return int Number of characters matched.
     */
    function utf8_strlen($string = null) {
        // In UTF-8 mode (u) with dot-all (s), every character, line breaks
        // included, is one match.
        preg_match_all("/./us", $string, $chars);
        $units = $chars[0];

        return count($units);
    }
    
    
    /**
     * Escape a string so it can be embedded literally in a "/"-delimited regex.
     *
     * The hand-written conversion table had lost its backslashes, so it escaped
     * nothing and its backslash entry broke parsing. preg_quote() covers every
     * character of that table, including the "/" delimiter.
     *
     * @param string $str Literal text.
     * @return string Text with regex metacharacters backslash-escaped.
     */
    function reg_escape( $str )
    {
        return preg_quote( $str, '/' );
    }
      
    /**
     * Strip attribute Class
     * Remove attributes from XML/HTML elements, keeping only a whitelist.
     *
     * Configure the public properties, then call strip():
     *  - $allow:      attribute names kept on every element
     *  - $exceptions: element name => attribute names additionally kept on it
     *  - $ignore:     element names that are left untouched
     *
     * @author David (semlabs.co.uk)
     * @version 0.2.1
     */
    class cleanHtml
    {
        /** @var string Markup being processed (updated in place by strip()). */
        public $str = '';
        /** @var array Attribute names allowed on all elements. */
        public $allow = array();
        /** @var array Map of element name => array of extra allowed attribute names. */
        public $exceptions = array();
        /** @var array Element names whose attributes are not touched. */
        public $ignore = array();

        /**
         * Remove all non-whitelisted attributes from the given markup.
         *
         * @param string $str Markup to clean.
         * @return mixed The cleaned markup; non-string or empty input is returned as is.
         */
        public function strip($str)
        {
            $this->str = $str;

            if (is_string($str) && strlen($str) > 0) {
                $res = $this->findElements();
                if (is_string($res)) {
                    // No element carries attributes: nothing to do.
                    return $res;
                }
                $nodes = $this->findAttributes($res);
                $this->removeAttributes($nodes);
            }

            return $this->str;
        }

        /**
         * Collect every opening tag that has something after its element name.
         *
         * @return array|string List of nodes (literal, name, attributes), or the
         *                      unchanged markup when there is nothing to process.
         */
        private function findElements()
        {
            // Closing tags ("</") and comments/doctypes ("<!") are skipped because
            // "/" and "!" cannot start an element name.
            if (!preg_match_all('/<([^ \t\r\n!\/>]+)([^>]*)>/i', $this->str, $elements)) {
                return $this->str;
            }

            $nodes = array();
            foreach ($elements[1] as $el_key => $element_name) {
                $attributes = $elements[2][$el_key];
                if (!$attributes) {
                    continue;
                }
                if (is_array($this->ignore) && !in_array($element_name, $this->ignore, true)) {
                    $nodes[] = array(
                        'literal'    => $elements[0][$el_key],
                        'name'       => $element_name,
                        'attributes' => $attributes,
                    );
                }
            }

            // Return the markup itself if there were no attributes to remove.
            return empty($nodes) ? $this->str : $nodes;
        }

        /**
         * Parse the raw attribute string of every node into name/value pairs.
         *
         * @param array $nodes Nodes from findElements().
         * @return array Same nodes; 'attributes' becomes a list of
         *               (literal, name, value) arrays, or null when none parsed.
         */
        private function findAttributes($nodes)
        {
            foreach ($nodes as $key => $node) {
                $atts = array();
                $found = preg_match_all(
                    '/([^ \t\r\n=]+)\s*=\s*["\']?([^"\']*)["\']?/i',
                    $node['attributes'],
                    $attributes
                );
                if ($found) {
                    foreach ($attributes[1] as $att_key => $attribute_name) {
                        $atts[] = array(
                            'literal' => $attributes[0][$att_key],
                            'name'    => $attribute_name,
                            'value'   => $attributes[2][$att_key],
                        );
                    }
                }
                $nodes[$key]['attributes'] = $atts ? $atts : null;
            }

            return $nodes;
        }

        /**
         * Rewrite every collected tag so that only whitelisted attributes remain.
         *
         * @param array $nodes Nodes from findAttributes().
         * @return void
         */
        private function removeAttributes($nodes)
        {
            foreach ($nodes as $node) {
                $node_name = $node['name'];
                $new_attributes = '';

                if (is_array($node['attributes'])) {
                    foreach ($node['attributes'] as $attribute) {
                        $allowed = is_array($this->allow)
                            && in_array($attribute['name'], $this->allow, true);
                        if ($allowed || $this->isException($node_name, $attribute['name'], $this->exceptions)) {
                            $new_attributes = $this->createAttributes(
                                $new_attributes,
                                $attribute['name'],
                                $attribute['value']
                            );
                        }
                    }
                }

                $replacement = ($new_attributes) ? "<$node_name $new_attributes>" : "<$node_name>";

                // Plain string replacement: the tag literal needs no regex escaping,
                // and "$1" or "\1" inside an attribute value is not treated as a
                // backreference.
                $this->str = str_replace($node['literal'], $replacement, $this->str);
            }
        }

        /**
         * Tell whether an attribute is explicitly allowed for one element.
         *
         * @param string $element_name   Element name.
         * @param string $attribute_name Attribute name.
         * @param array  $exceptions     Unused; $this->exceptions is consulted.
         * @return bool
         */
        private function isException($element_name, $attribute_name, $exceptions)
        {
            return is_array($this->exceptions)
                && array_key_exists($element_name, $this->exceptions)
                && in_array($attribute_name, $this->exceptions[$element_name], true);
        }

        /**
         * Append one name="value" pair to an attribute string.
         *
         * @param string $new_attributes Attribute string built so far.
         * @param string $name           Attribute name.
         * @param string $value          Attribute value (never contains quotes).
         * @return string Extended attribute string.
         */
        private function createAttributes($new_attributes, $name, $value)
        {
            if ($new_attributes) {
                $new_attributes .= ' ';
            }
            $new_attributes .= $name . '="' . $value . '"';

            return $new_attributes;
        }
    }
    
    ?>
    

    我的其他文章:

    1、火车头采集伪原创插件PHP版实现

    2、火车头伪原创插件使用教程

    3、火车头如何调用百度NLP摘要,这里给大家一个PHP示例

    4、自然语言处理技术开发的伪原创工具

    5、火车头采集标题如何伪原创(附教程)

  • 相关阅读:
    poj 3087 直接模拟
    POJ-3126 BFS,埃式筛选及黑科技
    POJ3278-Catch That Cow
    js变量提升
    饿了么
    2分钟就能学会的【Google/百度搜索大法】了解一下?
    span标签间距
    Vue移动端项目如何使用手机预览调试
    Port 3000 is already in use
    koa2第一天 async详解
  • 原文地址:https://www.cnblogs.com/python168/p/13942599.html
Copyright © 2011-2022 走看看