分享以前写的 packagist 镜像爬虫

crawl.php

<?php
use Qiniu\Auth;
use Qiniu\Storage\BucketManager;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use QXS\WorkerPool\ClosureWorker;
use QXS\WorkerPool\WorkerPool;
use ProgressBar\Manager as ProgressBarManager;

// All date() calls below (the quiet-hours check here and the sync-time stamp written by buildjson) use Asia/Shanghai time.
date_default_timezone_set("Asia/Shanghai");
// Limit one run to 900 seconds of execution time and 1 GB of memory.
set_time_limit(900);
ini_set('memory_limit', '1G');
// Environment knob intended for Guzzle's curl handler (select timeout).
// NOTE(review): confirm the installed Guzzle version still honours this variable.
putenv("GUZZLE_CURL_SELECT_TIMEOUT=600");

require_once __DIR__ . '/vendor/autoload.php';

// Prefer a local config.php; otherwise fall back to config.default.php next to this script.
if (file_exists(__DIR__ . '/config.php')) {
    $config = require __DIR__ . '/config.php';
} else {
    $config = require __DIR__ . '/config.default.php';
}

// Quiet hours: between 00:00 and 05:59 the script sleeps 800 seconds and exits without crawling.
// NOTE(review): presumably the script is restarted in a loop by a supervisor/cron -- confirm.
if(date('H') < 6){
    sleep(800);
    exit(0);
}

// downloadProviders() returns only the provider files fetched during this run;
// an empty result means nothing new was downloaded, so wait 15 seconds and stop.
$providers = downloadProviders($config);
if(empty($providers)){
    sleep(15);
    exit(0);
}
// Pipeline: fetch the per-package metadata, rebuild the mirror files, write the
// root packages.json, then ask the CDN to prefetch/refresh the changed entry points.
downloadPackages($config, $providers);
$hash=buildPackages($config);
buildjson($hash,$config);
cleanrCdn($hash,$config);
exit(0);

/**
 * Asks the Qiniu Fusion CDN to prefetch the new "p/all$<hash>.json" index and
 * to refresh the cached packages.json, dumping each raw API response.
 *
 * The request token is the URL-safe base64 of HMAC-SHA1(secretKey, path + "\n"),
 * which is exactly what the previous `echo | openssl dgst | base64 | tr` shell
 * pipeline produced. It is now computed in-process so the secret key never
 * appears on a command line. Every value handed to curl is shell-escaped.
 *
 * @param string $hash   sha256 of the freshly built p/all index
 * @param object $config needs ->accessKey and ->secretKey
 * @return void
 */
function cleanrCdn($hash,$config){

    // API path => URL to submit. Prefetch first, then refresh (same order as before).
    $jobs = array(
        '/v2/tune/prefetch' => 'https://packagist.composer-proxy.org/p/all$'.$hash.'.json',
        '/v2/tune/refresh'  => 'https://packagist.composer-proxy.org/packages.json',
    );

    foreach ($jobs as $path => $url) {
        // `echo` appended a newline to the signed path; keep it so tokens stay identical.
        $digest = hash_hmac('sha1', $path."\n", $config->secretKey, true);
        $token  = strtr(base64_encode($digest), '+/', '-_');

        // JSON_UNESCAPED_SLASHES keeps the payload byte-identical to the old hand-written JSON.
        $payload = json_encode(array('urls' => array($url)), JSON_UNESCAPED_SLASHES);

        $exec = 'curl -X POST'
            .' -H '.escapeshellarg('Authorization: QBox '.$config->accessKey.':'.$token)
            .' http://fusion.qiniuapi.com'.$path
            .' -d '.escapeshellarg($payload)
            .' -H '.escapeshellarg('Content-Type: application/json');
        var_dump(shell_exec($exec));
    }

}

/**
 * Writes the mirror's root packages.json into the build directory.
 *
 * The document has no inline packages. It points notify/search at packagist.org
 * and references the single aggregated provider file "p/all$%hash%.json" by its
 * sha256. It also records the generation time under "sync-time".
 *
 * @param string $hash   sha256 of the p/all provider index
 * @param object $config needs ->builddir (with trailing slash)
 * @return void
 */
function buildjson($hash,$config){

    $manifest = array(
        'packages'          => array(),
        'notify'            => "https://packagist.org/downloads/%package%",
        'notify-batch'      => "https://packagist.org/downloads/",
        'providers-url'     => "/p/%package%$%hash%.json",
        'search'            => "https://packagist.org/search.json?q=%query%&type=%type%",
        'provider-includes' => array(
            'p/all$%hash%.json' => array('sha256' => $hash),
        ),
        'sync-time'         => date(DATE_W3C),
    );

    $target = $config->builddir.'packages.json';
    file_put_contents($target, json_encode((object)$manifest));

}

/**
 * Rebuilds the mirror's per-package files and the aggregated "p/all$<hash>.json" index.
 *
 * Steps:
 *  1. Merge the "providers" maps of every provider-include listed in the cached packages.json.
 *  2. If db/list.json exists, keep only the packages it names.
 *  3. In a worker pool, rewrite every zip dist URL of each package to point at
 *     $config->dlUrl. Write build/p/<name>.json and build/p/<name>$<sha256>.json,
 *     update dl/<name>.json and append the new hash to db/<name>.txt.
 *  4. Send each dist archive not seen before to the download server on
 *     127.0.0.1:9501 as a {"key","url"} JSON message.
 *  5. Write build/p/all$<sha256>.json and append its hash to db/all.txt.
 *
 * @param object $config needs ->cachedir, ->builddir, ->dldir, ->dbdir, ->dlUrl, ->maxPool
 * @return string sha256 of the generated p/all index
 * @throws \RuntimeException when the cached packages.json has no "provider-includes"
 */
function buildPackages($config){
    $cachedir = $config->cachedir;
    $root = json_decode(file_get_contents($cachedir . 'packages.json'), true);
    if (empty($root['provider-includes'])) {
        throw new \RuntimeException('packages.json schema changed?');
    }

    $packageslist = array();
    foreach ($root['provider-includes'] as $tpl => $version) {
        $providerFile = $cachedir . str_replace('%hash%', $version['sha256'], $tpl);
        $decoded = json_decode(file_get_contents($providerFile), true);
        if (!empty($decoded['providers'])) {
            $packageslist = array_merge($packageslist, $decoded['providers']);
        }
    }

    // Restrict to the names in list.json, but only when that file exists and is usable.
    // The list used to be overwritten unconditionally, so a missing list.json replaced
    // the providers with the raw packages.json document.
    $listFile = $config->dbdir.'list.json';
    if (file_exists($listFile)) {
        echo "Optimize Packages....".PHP_EOL;
        $known = json_decode(file_get_contents($listFile), true);
        if (!empty($known['packageNames']) && is_array($known['packageNames'])) {
            $filtered = array();
            foreach ($known['packageNames'] as $name) {
                if (isset($packageslist[$name])) {
                    $filtered[$name] = $packageslist[$name];
                }
            }
            $packageslist = $filtered;
        }
    }

    echo "build Packagesjson....".PHP_EOL;
    $progressBar = new ProgressBarManager(0, count($packageslist));
    $progressBar->setFormat('build Packagesjson: %current%/%max% [%bar%] %percent%%');

    $wp=new WorkerPool();
    $wp->setWorkerPoolSize($config->maxPool)->create(new ClosureWorker(
        /**
          * Builds the mirror files for one package.
          *
          * @param mixed $input array('key' => package name, 'value' => provider entry with 'sha256', 'config' => config object)
          * @param \QXS\WorkerPool\Semaphore $semaphore the semaphore to synchronize calls across all workers (unused)
          * @param \ArrayObject $storage a persistent storage for the current child process (unused)
          * @return array empty when there is nothing to build, else array('key', 'hash', 'dls')
          */
        function($input, $semaphore, $storage) {
            $config     = $input['config'];
            $packageKey = $input['key'];
            $provider   = $input['value'];
            $builddir   = $config->builddir;

            $cached = $config->cachedir.'p/'.$packageKey.'$'.$provider['sha256'].'.json';
            if (!file_exists($cached)) {
                return array();
            }

            $meta = json_decode(file_get_contents($cached));
            if (empty($meta->packages)) {
                return array();
            }

            $dl   = array(); // dist reference => original upstream URL
            $keys = array(); // dist reference => storage key "<package>/<reference>.zip"
            $dls  = array(); // storage key    => original upstream URL (download candidates)
            foreach ($meta->packages as $versions) {
                foreach ($versions as $version) {
                    // Only zip dists are mirrored; versions without a complete dist are left untouched.
                    if (!is_object($version)
                        || !isset($version->dist->type, $version->dist->reference, $version->dist->url)
                        || $version->dist->type != "zip"
                    ) {
                        continue;
                    }
                    $dist  = $version->dist;
                    $dlKey = $packageKey.'/'.$dist->reference.'.'.$dist->type;

                    $dl[$dist->reference]   = $dist->url;
                    $keys[$dist->reference] = $dlKey;
                    $dls[$dlKey]            = $dist->url;
                    // $dist is the same object as inside $meta, so this rewrites the encoded document.
                    $dist->url = $config->dlUrl.$dlKey;
                }
            }

            $json       = json_encode($meta);
            $hash       = hash('sha256', $json);
            $buildfile  = $builddir.'p/'.$packageKey.'.json';
            $buildfile2 = $builddir.'p/'.$packageKey.'$'.$hash.'.json';
            $dlfile     = $config->dldir.$packageKey.'.json';

            if (file_exists($buildfile2)) {
                // Already built with identical content: nothing new to download.
                return array('key'=>$packageKey,'hash'=>$hash,'dls'=>array());
            }

            if (!is_dir(dirname($buildfile))) {
                @mkdir(dirname($buildfile), 0777, true);
            }

            // Drop previously built hashed files of this package.
            if ($glob = glob($builddir.'p/'.$packageKey.'$*')) {
                foreach ($glob as $old) {
                    @unlink($old);
                }
            }
            file_put_contents($buildfile, $json);
            file_put_contents($buildfile2, $json);

            if (!is_dir(dirname($dlfile))) {
                @mkdir(dirname($dlfile), 0777, true);
            }

            if (file_exists($dlfile)) {
                // References recorded by an earlier build were already queued; skip them.
                // Dedicated loop variables are required here: the previous code reused
                // $key/$value and thereby clobbered the package name used below.
                $old_dl = json_decode(file_get_contents($dlfile), true);
                foreach ($keys as $reference => $dlKey) {
                    if (isset($old_dl[$reference])) {
                        unset($dls[$dlKey]);
                    }
                }
            }

            file_put_contents($dlfile, json_encode($dl));

            // Hash history, consumed by the cleanup script.
            $dbpath = $config->dbdir.$packageKey.'.txt';
            if (!is_dir(dirname($dbpath))) {
                @mkdir(dirname($dbpath), 0777, true);
            }
            file_put_contents($dbpath, $hash.PHP_EOL, FILE_APPEND);

            return array('key'=>$packageKey,'hash'=>$hash,'dls'=>$dls);
        }
    )
    );

    foreach ($packageslist as $key => $value) {
        $wp->run(array('key'=>$key,'value'=>$value,'config'=>$config));
        $progressBar->advance();
    }

    $wp->waitForAllWorkers(); // wait for all workers

    $packlkist = array();
    $client = new Swoole\Client(SWOOLE_SOCK_TCP);
    $connected = $client->connect('127.0.0.1', 9501);
    if (!$connected) {
        // Keep building the index, but make the lost downloads visible.
        error_log('buildPackages: cannot reach the download server on 127.0.0.1:9501; new dist archives were not queued');
    }
    foreach($wp as $val) {
        if(empty($val['data'])) continue;
        $packlkist[$val['data']['key']]=$val['data']['hash'];
        if (!$connected) continue;
        foreach ($val['data']['dls'] as $dlKey => $dlUrl) {
            $client->send(json_encode(array('key'=>$dlKey,'url'=>$dlUrl)));
        }
    }
    ksort($packlkist);
    $packages=array();
    foreach ($packlkist as $name => $packageHash){
        $packages['providers'][$name]['sha256']=$packageHash;
    }

    $json = json_encode((object)$packages);
    $hash = hash('sha256',$json);

    $indexDir = $config->builddir.'p/';
    if (!is_dir($indexDir)) {
        @mkdir($indexDir, 0777, true);
    }
    // Replace any previous aggregated index.
    if ($glob = glob($indexDir.'all$*')) {
        foreach ($glob as $old) {
            @unlink($old);
        }
    }
    file_put_contents($indexDir.'all$'.$hash.'.json',$json);

    $dbpath= $config->dbdir.'all.txt';
    file_put_contents($dbpath,$hash.PHP_EOL,FILE_APPEND);
    return  $hash;
}

/**
 * packages.json & provider-xxx$xxx.json downloader.
 *
 * Stores the upstream packages.json in the cache directory, with relative
 * notify/notify-batch/search URLs made absolute against packagist.org. Also
 * stores packages/list.json in the db directory. Then fetches every
 * provider-include file that is not cached yet, removing older hashed copies
 * of the same provider first.
 *
 * @param object $config needs ->packagistUrl, ->cachedir, ->dbdir
 * @return array cache paths of the provider files downloaded in this run
 *               (empty when packages.json did not answer with status 200)
 * @throws \RuntimeException when packages.json carries no "provider-includes"
 */
function downloadProviders($config)
{
    $http = new Client(['base_uri' => $config->packagistUrl,'timeout'  =>60]);
    $cacheRoot = $config->cachedir;

    echo "Downloading Packages...".PHP_EOL;
    $rootResponse = $http->get('/packages.json');
    if (200 !== $rootResponse->getStatusCode()) {
        return array();
    }

    $packages = json_decode($rootResponse->getBody());
    foreach (array('notify', 'notify-batch', 'search') as $field) {
        // Upstream may publish these as site-relative paths.
        if (0 === strpos($packages->$field, '/')) {
            $packages->$field = 'https://packagist.org' . $packages->$field;
        }
    }
    file_put_contents($cacheRoot . 'packages.json', json_encode($packages));

    echo "Downloading All Packages...".PHP_EOL;
    $listResponse = $http->get('/packages/list.json');
    if (200 === $listResponse->getStatusCode()) {
        file_put_contents($config->dbdir.'list.json' , $listResponse->getBody());
    }

    if (empty($packages->{'provider-includes'})) {
        throw new \RuntimeException('packages.json schema changed?');
    }
    $includes = $packages->{'provider-includes'};

    $progressBar = new ProgressBarManager(0, count((array)$includes));
    $progressBar->setFormat('Downloading Providers: %current%/%max% [%bar%] %percent%%');

    $downloaded = [];
    foreach ($includes as $tpl => $version) {
        $relative = str_replace('%hash%', $version->sha256, $tpl);
        $target   = $cacheRoot . $relative;

        // Already cached under this hash: nothing to do.
        if (file_exists($target)) {
            $progressBar->advance();
            continue;
        }

        $response = $http->get($relative);
        if (200 !== $response->getStatusCode()) {
            $progressBar->advance();
            continue;
        }

        // Remove copies of this provider stored under older hashes.
        $stale = glob($cacheRoot . str_replace('%hash%.json', '*', $tpl));
        if ($stale) {
            foreach ($stale as $old) {
                @unlink($old);
            }
        }

        $targetDir = dirname($target);
        if (!file_exists($targetDir)) {
            mkdir($targetDir, 0777, true);
        }
        file_put_contents($target, $response->getBody());
        $downloaded[] = $target;

        $progressBar->advance();
    }

    return $downloaded;
}

/**
 * composer.json downloader.
 *
 * Reads the given provider files, collects the "p/<package>$<sha256>.json"
 * URLs that are not in the cache yet and downloads them concurrently. Each
 * response is stored under the cache directory after older hashed copies of
 * the same package have been removed.
 *
 * @param object $config    needs ->cachedir, ->packagistUrl, ->maxConnections
 * @param array  $providers paths of cached provider-include files
 * @return array|null an empty array when nothing had to be downloaded, otherwise nothing
 */
function downloadPackages($config, $providers)
{
    $cachedir = $config->cachedir;
    $urls = [];

    foreach ($providers as $providerjson) {
        $list = json_decode(file_get_contents($providerjson));
        if (!$list || empty($list->providers)) continue;

        foreach ($list->providers as $packageName => $provider) {
            $url = "$config->packagistUrl/p/$packageName\$$provider->sha256.json";
            $cachefile = $cachedir . str_replace("$config->packagistUrl/", '', $url);
            if (file_exists($cachefile)) continue;
            $urls[]=$url;
        }
    }

    $numberOfProviders = count($urls);
    if(0 === $numberOfProviders) return array();

    $progressBar = new ProgressBarManager(0, $numberOfProviders);
    $progressBar->setFormat('Downloading Package: %current%/%max% [%bar%] %percent%%');

    $client = new Client();
    // Lazily yields one request factory per URL so the pool only keeps
    // `concurrency` requests in flight.
    $requests = function ($urls) use ($client) {
        foreach ($urls as $uri) {
            yield function() use ($client, $uri) {
                return $client->getAsync($uri);
            };
        }
    };

    $pool = new Pool($client, $requests($urls), [
        'concurrency' => $config->maxConnections,
        'fulfilled'   => function ($response, $index) use ($progressBar,$urls,$config,$cachedir){
            // The body is stored verbatim; it used to be json_decode()d into an
            // unused variable for every package, which only cost CPU and memory.
            $cachefile = $cachedir. str_replace("$config->packagistUrl/", '', $urls[$index]);
            $packageName = explode('$',str_replace("$config->packagistUrl/p/", '', $urls[$index]))[0];

            // Remove copies of this package cached under older hashes.
            if ($glob = glob("{$cachedir}p/$packageName\$*")) {
                foreach ($glob as $old) {
                    @unlink($old);
                }
            }
            if (!file_exists(dirname($cachefile))) {
                mkdir(dirname($cachefile), 0777, true);
            }
            file_put_contents($cachefile, (string) $response->getBody());

            $progressBar->advance();
        },
        'rejected' => function ($reason, $index) use ($progressBar, $urls){
            // Still best-effort, but record what failed instead of dropping it silently.
            $message = $reason instanceof \Exception ? $reason->getMessage() : 'unknown error';
            error_log('downloadPackages: '.$urls[$index].' failed: '.$message);

            $progressBar->advance();
        },
    ]);

    // Start sending the requests and block until all of them have settled.
    $promise = $pool->promise();
    $promise->wait();
}

Dl_Server.php

<?php
use Qiniu\Auth;
use Qiniu\Storage\UploadManager;

// Dates in the log file name (see Server::__construct) use Asia/Shanghai time.
date_default_timezone_set("Asia/Shanghai");
ini_set('memory_limit', '1G');
require_once __DIR__ . '/vendor/autoload.php';

// Prefer a local config.php; otherwise fall back to config.default.php next to this script.
if (file_exists(__DIR__ . '/config.php')) {
    $config = require __DIR__ . '/config.php';
} else {
    $config = require __DIR__ . '/config.default.php';
}

/**
 * Swoole TCP server on 127.0.0.1:9501 that mirrors dist archives.
 *
 * Clients send JSON messages of the form {"key": "<storage key>", "url": "<source URL>"}.
 * Each message becomes a task: the task worker downloads the URL with curl and
 * uploads the body to the "dl-composer" Qiniu bucket under the given key.
 * Activity is appended to logs/sync-<date>.txt; the date is taken once, at construction.
 */
class Server
{
    /** @var Swoole\Server */
    private $serv;
    /** @var string path of the activity log */
    private $logFile;
    /** @var object needs ->accessKey and ->secretKey */
    private $config;

    /**
     * Configures the server, registers the callbacks and starts it.
     * start() is called here, so constructing the object runs the server.
     *
     * @param object $config application config
     */
    public function __construct($config)
    {

        $this->config  = $config;
        $this->logFile = __DIR__.'/logs/sync-'.date('Y-m-d').'.txt';
        $this->serv = new Swoole\Server("127.0.0.1", 9501);
        $this->serv->set([
            'worker_num' => 2,   // usually 1-4x the number of CPU cores
            'task_worker_num' => 10,  // number of task processes (tasks here are synchronous and blocking)
            'daemonize' => true,  // run as a daemon
            // NOTE(review): the original comment called 3 "unix socket, the default mode";
            // Swoole documents 1 as the unix-socket default and 3 as message-queue
            // contention mode -- confirm which one is intended.
            'task_ipc_mode' => 3,
            'log_file' => __DIR__.'/logs/queue.log' ,    // swoole's own log

            // Packet dispatch strategy. With dispatch_mode 1 or 3 swoole suppresses
            // onConnect/onClose because their order relative to onReceive cannot be
            // guaranteed; servers that are not request/response style should avoid 1 and 3.
            // NOTE(review): the original comment described 3 as "fixed mode, worker chosen
            // by connection fd", which is what Swoole documents for mode 2 -- confirm.
            'dispatch_mode' => 3,
            ]);
        $this->serv->on('receive', array($this, 'onReceive'));
        $this->serv->on('task', array($this, 'onTask'));
        $this->serv->on('finish', array($this, 'onFinish'));

        $this->serv->start();
    }

    /**
     * Logs the received data and queues one task per JSON message it contains.
     *
     * @param Swoole\Server $serv
     * @param int           $fd      connection descriptor
     * @param int           $from_id id of the originating reactor/worker
     * @param string        $data    raw bytes read from the socket
     * @return void
     */
    public function onReceive($serv, $fd, $from_id, $data)
    {

        $str  = PHP_EOL."=========== onReceive ============".PHP_EOL;
        $str .= "Get Message From Client {$fd}:{$data}".PHP_EOL;
        error_log($str, 3, $this->logFile);

        foreach ($this->splitMessages($data) as $message) {
            $serv->task( $message );
        }
    }

    /**
     * Splits one TCP read into individual JSON objects.
     *
     * TCP has no message boundaries, so several back-to-back sends can arrive as
     * "{...}{...}". Previously such a read failed json_decode() as a whole and
     * every download in it was dropped. Assumes "}{" never occurs inside a key or
     * URL value. A message cut in half across two reads is still not reassembled.
     *
     * @param string $data
     * @return array list of message strings (the input itself when no boundary is found)
     */
    private function splitMessages($data)
    {
        $parts = preg_split('/(?<=\})\s*(?=\{)/', $data);

        return $parts === false ? array($data) : $parts;
    }

    /**
     * Downloads the URL of one message and uploads the body to the bucket.
     *
     * @param Swoole\Server $serv
     * @param int           $task_id
     * @param int           $from_id
     * @param string        $data    JSON message with "key" and "url"
     * @return string status text handed to onFinish
     */
    public function onTask($serv, $task_id, $from_id, $data)
    {

        $array  = json_decode( $data , true );

        // Covers invalid JSON, non-object payloads and missing or empty fields.
        if(!is_array($array)
            || !isset($array['key'], $array['url'])
            || $array['key'] == ''
            || $array['url'] == ''
        ){
            return 'url is  null';
        }

        $curl_handle = curl_init();
        curl_setopt($curl_handle, CURLOPT_URL, $array['url']);
        // 0 disables both timeouts, so a stalled download blocks this task worker indefinitely.
        curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 0);
        curl_setopt($curl_handle, CURLOPT_TIMEOUT, 0);
        // NOTE(review): peer verification is disabled, so a tampered archive would be
        // mirrored unnoticed -- enable it once a CA bundle is available on the host.
        curl_setopt($curl_handle, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Composer-proxy Download Engine v1.0 https://www.composer-proxy.org');
        curl_setopt($curl_handle, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, TRUE);
        $Response = curl_exec($curl_handle);
        $error    = curl_error($curl_handle);
        $status   = (int) curl_getinfo($curl_handle, CURLINFO_HTTP_CODE);
        curl_close($curl_handle);

        if($error || $Response === false){
            $error = $error ? $error : 'download failed';
            error_log('Curl error:'.$error.PHP_EOL, 3 , $this->logFile);
            return $error;
        }

        // Without this check a 404/500 error page would be uploaded as the archive.
        if($status < 200 || $status >= 300){
            $error = 'HTTP '.$status.' for '.$array['url'];
            error_log('Curl error:'.$error.PHP_EOL, 3 , $this->logFile);
            return $error;
        }

        $auth = new Auth($this->config->accessKey, $this->config->secretKey);
        $bucket="dl-composer";
        $token = $auth->uploadToken($bucket);
        $uploadMgr = new UploadManager();

        list($ret, $err) = $uploadMgr->put($token,$array['key'], $Response);
        $str    = "=========== onTask ============".PHP_EOL;
        $str   .= var_export($array, 1).PHP_EOL;
        $str   .="=====> fetch ".$array['url']." to bucket: $bucket  key: ".$array['key'].PHP_EOL;

        error_log($str, 3 , $this->logFile);
        if ($err !== null) {
            return var_export($err, 1).PHP_EOL;
        }

        return 'Success';
    }

    /**
     * Logs the result string returned by onTask.
     *
     * @param Swoole\Server $serv
     * @param int           $task_id
     * @param string        $data    return value of onTask
     * @return void
     */
    public function onFinish($serv, $task_id, $data)
    {
        $str  = "=========== onFinish ============".PHP_EOL;
        $str .= "Task {$task_id} finish !".PHP_EOL;
        $str .= $data;
        error_log($str, 3, $this->logFile);
    }

}

// Constructing the Server also starts it: the constructor ends with $this->serv->start().
new Server($config);

config.default.php

<?php
// Default settings, loaded by every script when no config.php exists next to it.
// All directory values end with a slash because the scripts concatenate file names onto them.
return (object)array(
    // Upstream metadata as downloaded: packages.json, provider files and p/<package>$<hash>.json.
    'cachedir' => __DIR__ . '/cache/',
    // Generated mirror files: packages.json and p/... with rewritten dist URLs.
    'builddir' => __DIR__ . '/build/',
    // Per-package JSON maps of dist reference => original download URL.
    'dldir'    => __DIR__ . '/dl/',
    // Bookkeeping: list.json, 404.json and the per-package / all.txt hash histories.
    'dbdir'    => __DIR__ . '/db/',
    // Prefix that replaces the original dist URLs in the built package files.
    'dlUrl' => 'https://dl.composer-proxy.org/',
    // Upstream repository to mirror (no trailing slash).
    'packagistUrl' => 'https://packagist.org',
    // Concurrency of the Guzzle request pool in downloadPackages().
    'maxConnections' => 200,
    // Number of worker processes used by buildPackages().
    'maxPool'        => 15,
    // Qiniu credentials used for CDN calls, uploads and bucket cleanup.
    'accessKey'      => '--',
    'secretKey'      => '',
    // NOTE(review): not read by any of the scripts shown here; the upload server and
    // the cleanup script hard-code their bucket names -- confirm whether this is still needed.
    'bucket'         =>  ''
);

clean_qiniu_old.php

<?php

use Qiniu\Auth;
use Qiniu\Storage\BucketManager;
set_time_limit(3600);
require_once __DIR__ . '/vendor/autoload.php';
ini_set('memory_limit', '1G');
if (file_exists(__DIR__ . '/config.php')) {
    $config = require __DIR__ . '/config.php';
} else {
    $config = require __DIR__ . '/config.default.php';
}

/**
 * Trims a hash-history file to its newest entries and reports the dropped hashes.
 *
 * The file holds one sha256 per line, oldest first. Blank lines are ignored.
 * The previous rewrite appended an extra newline on every run, and those blank
 * lines were then counted as hashes. A dropped hash that also appears among the
 * kept ones is not reported, so a file that is current again is never deleted.
 *
 * @param string $file path of the history file
 * @param int    $keep number of newest hashes to keep (>= 1)
 * @return array hashes removed from the history and safe to delete remotely
 */
function pruneHashHistory($file, $keep = 2)
{
    if (!is_file($file)) {
        return array();
    }
    $lines = file($file);
    if ($lines === false) {
        return array();
    }

    $hashes = array_values(array_filter(array_map('trim', $lines), 'strlen'));
    $total  = count($hashes);
    if ($total <= $keep) {
        return array();
    }

    $kept  = array_slice($hashes, $total - $keep);
    $stale = array_slice($hashes, 0, $total - $keep);
    file_put_contents($file, implode(PHP_EOL, $kept).PHP_EOL);

    return array_values(array_unique(array_diff($stale, $kept)));
}

$secretKey = $config->secretKey;
$accessKey = $config->accessKey;
$auth = new Auth($accessKey, $secretKey);
$bucketMgr = new BucketManager($auth);
$filename=array();

$path = $config->dbdir;

// Per-package histories live at <dbdir><vendor>/<package>.txt.
$histories = glob($path.'*/*.txt');
foreach ($histories ? $histories : array() as $historyFile) {
    // The storage key is the path below dbdir without the ".txt" suffix. The old
    // code stripped ".json" here, so the generated keys never matched a stored object.
    $key = substr(str_replace($path, '', $historyFile), 0, -strlen('.txt'));
    foreach (pruneHashHistory($historyFile) as $hash) {
        $filename[]='p/'.$key.'$'.$hash.'.json';
    }
}

// Same treatment for the aggregated index; a missing all.txt yields nothing.
foreach (pruneHashHistory($path.'all.txt') as $hash) {
    $filename[]='p/all$'.$hash.'.json';
}

$bucket="packagist";

// Delete in batches of 500 keys per request.
$arr = array_chunk($filename,500);
foreach ($arr as  $value) {
    $files = $bucketMgr->buildBatchDelete($bucket,$value);
    var_dump($bucketMgr->batch($files));
}

check_404.php

<?php
use QXS\WorkerPool\ClosureWorker;
use QXS\WorkerPool\WorkerPool;
use ProgressBar\Manager as ProgressBarManager;

date_default_timezone_set("Asia/Shanghai");
// Allow up to 7200 seconds of execution time and 1 GB of memory for the full scan.
set_time_limit(7200);
ini_set('memory_limit', '1G');

require_once __DIR__ . '/vendor/autoload.php';

// Prefer a local config.php; otherwise fall back to config.default.php next to this script.
if (file_exists(__DIR__ . '/config.php')) {
    $config = require __DIR__ . '/config.php';
} else {
    $config = require __DIR__ . '/config.default.php';
}

/**
 * Turns a package source URL into the web URL that gets probed.
 *
 * Removes a trailing ".git" suffix. The old rtrim($url, '.git') stripped any
 * trailing '.', 'g', 'i' or 't' characters, turning ".../widget" into ".../widge".
 * SCP-style "git@host:path" becomes "https://host/path". Every other URL is
 * returned unchanged; the old blanket ':' => '/' replacement broke http:// URLs
 * and URLs containing a port.
 *
 * @param string $url
 * @return string
 */
function normalizeSourceUrl($url)
{
    $url = trim((string) $url);
    if (substr($url, -4) === '.git') {
        $url = substr($url, 0, -4);
    }
    if (preg_match('#^git@([^:/]+):(.+)$#', $url, $m)) {
        return 'https://'.$m[1].'/'.$m[2];
    }

    return $url;
}

/**
 * Finds packages whose source repository no longer answers with HTTP 200 and
 * writes their names to <dbdir>404.json.
 *
 * A cached package file that cannot be decoded or has no "packages" entry is
 * reported right away under the name taken from its file path. For every other
 * file the source URL of the first listed version is probed in a pool of 20
 * workers. Files whose first version has no name or no source URL are skipped,
 * because there is nothing to probe.
 *
 * @param object $config needs ->cachedir and ->dbdir
 * @return void
 */
function check_404($config){
    $path = $config->cachedir;
    $packagist=array();
    $pack_404 =array();

    $cached = glob($path.'p/*/*.json');
    foreach ($cached ? $cached : array() as $v) {
        $arr=json_decode(file_get_contents($v),true);
        if(!is_array($arr) || empty($arr['packages'])){
            $pack_404[]=explode('$',str_replace($config->cachedir.'p/','',$v))[0];
            continue;
        }

        // First version of the first package in the file.
        $versions = current($arr['packages']);
        $first    = is_array($versions) ? current($versions) : false;
        if(!is_array($first) || empty($first['name']) || empty($first['source']['url'])){
            continue;
        }
        $packagist[$first['name']] = normalizeSourceUrl($first['source']['url']);
    }

    $progressBar = new ProgressBarManager(0, count($packagist));
    $progressBar->setFormat('Check  404 Project: %current%/%max% [%bar%] %percent%%');
    $wp=new WorkerPool();
    $wp->setWorkerPoolSize(20)->create(new ClosureWorker(
                    /**
                      * Probes one repository URL.
                      *
                      * @param mixed $input array('key' => package name, 'url' => URL to probe)
                      * @param \QXS\WorkerPool\Semaphore $semaphore the semaphore to synchronize calls across all workers (unused)
                      * @param \ArrayObject $storage a persistent storage for the current child process (unused)
                      * @return string|false the package name when the status is not 200, false otherwise
                      */
                    function($input, $semaphore, $storage) {

                        if(200 != GetHttpStatusCode($input['url'])){
                            return  $input['key'];
                        }
                        return  false;
                    }
            )
    );

    foreach ($packagist as $key => $value) {
            $wp->run(array('key'=>$key,'url'=>$value));
            $progressBar->advance();
    }

    $wp->waitForAllWorkers();

    // Collect the names the workers flagged.
    foreach($wp as $val) {
            if($val['data']){
                $pack_404[]=$val['data'];
            }
    }
    file_put_contents($config->dbdir.'404.json',json_encode($pack_404),LOCK_EX);
}

/**
 * Returns the HTTP status code a URL finally answers with.
 *
 * Sends a body-less request, follows redirects and gives up after 30 seconds.
 * TLS peer and host verification are switched off. When no HTTP response is
 * received the code reported by curl is 0.
 *
 * @param string $url
 * @return int
 */
function GetHttpStatusCode($url){
    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $url,     // URL to probe
        CURLOPT_USERAGENT      => 'Composer-proxy Check Engine v1.0 https://www.composer-proxy.org',
        CURLOPT_HEADER         => 1,        // include the response headers
        CURLOPT_SSL_VERIFYPEER => FALSE,
        CURLOPT_SSL_VERIFYHOST => FALSE,
        CURLOPT_NOBODY         => 1,        // do not fetch the response body
        CURLOPT_RETURNTRANSFER => 1,        // return the data instead of printing it
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_TIMEOUT        => 30,       // timeout in seconds
    ));
    curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    return $status;
}

 // Run the scan once with the loaded config, then end the script.
 check_404($config);

 exit;
《L04 微信小程序从零到发布》
从小程序个人账户申请开始,带你一步步进行开发一个微信小程序,直到提交微信控制台上线发布。
《G01 Go 实战入门》
从零开始带你一步步开发一个 Go 博客项目,让你在最短的时间内学会使用 Go 进行编码。项目结构很大程度上参考了 Laravel。
讨论数量: 0
(= ̄ω ̄=)··· 暂无内容!

讨论应以学习和精进为目的。请勿发布不友善或者负能量的内容,与人为善,比聪明更重要!