分享以前写的 packagist 镜像爬虫
crawl.php
<?php
use Qiniu\Auth;
use Qiniu\Storage\BucketManager;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use QXS\WorkerPool\ClosureWorker;
use QXS\WorkerPool\WorkerPool;
use ProgressBar\Manager as ProgressBarManager;
// ---------------------------------------------------------------------------
// crawl.php entry script: runs one mirror pass and then exits.
// ---------------------------------------------------------------------------
// All date()/hour checks below are evaluated in Asia/Shanghai time.
date_default_timezone_set("Asia/Shanghai");
set_time_limit(900);
ini_set('memory_limit', '1G');
// Exported before the autoloader is pulled in.
// NOTE(review): presumably read by Guzzle's curl handler as its select timeout - confirm.
putenv("GUZZLE_CURL_SELECT_TIMEOUT=600");
require_once __DIR__ . '/vendor/autoload.php';
// A local config.php takes precedence over the shipped config.default.php.
if (file_exists(__DIR__ . '/config.php')) {
$config = require __DIR__ . '/config.php';
} else {
$config = require __DIR__ . '/config.default.php';
}
// Between 00:00 and 05:59 no crawl is performed: sleep 800 seconds, then exit.
if(date('H') < 6){
sleep(800);
exit(0);
}
// downloadProviders() returns only the provider files it downloaded in this
// run; an empty result means nothing new was fetched, so stop after a short pause.
$providers = downloadProviders($config);
if(empty($providers)){
sleep(15);
exit(0);
}
// Fetch the package files listed by the new providers, rebuild the mirror
// metadata, write the root packages.json and finally notify the CDN.
downloadPackages($config, $providers);
$hash=buildPackages($config);
buildjson($hash,$config);
cleanrCdn($hash,$config);
exit(0);
/**
 * Notify the Qiniu Fusion CDN after a rebuild.
 *
 * Two POST requests are sent to http://fusion.qiniuapi.com:
 *  - /v2/tune/prefetch for the new p/all$<hash>.json provider index
 *  - /v2/tune/refresh  for packages.json
 * The raw response of each call is dumped with var_dump().
 *
 * The "QBox" token is the URL-safe base64 of HMAC-SHA1(secretKey, path + "\n"),
 * the same value the previous `echo | openssl dgst | base64 | tr` pipeline
 * produced. It is now computed in PHP so the secret key never appears on a
 * command line, and every curl argument is shell-quoted.
 *
 * @param string $hash   sha256 of the provider index written by buildPackages()
 * @param object $config reads accessKey and secretKey
 * @return void
 */
function cleanrCdn($hash, $config)
{
    $mirror = 'https://packagist.composer-proxy.org';
    $calls = array(
        '/v2/tune/prefetch' => $mirror . '/p/all$' . $hash . '.json',
        '/v2/tune/refresh' => $mirror . '/packages.json',
    );

    foreach ($calls as $path => $url) {
        // The signed string is the request path followed by a newline.
        $digest = hash_hmac('sha1', $path . "\n", $config->secretKey, true);
        $token = strtr(base64_encode($digest), '+/', '-_');

        // JSON_UNESCAPED_SLASHES keeps the body identical to the former
        // hand-written JSON literal.
        $body = json_encode(array('urls' => array($url)), JSON_UNESCAPED_SLASHES);

        $exec = 'curl -X POST'
            . ' -H ' . escapeshellarg('Authorization: QBox ' . $config->accessKey . ':' . $token)
            . ' http://fusion.qiniuapi.com' . $path
            . ' -d ' . escapeshellarg($body)
            . ' -H ' . escapeshellarg('Content-Type: application/json');

        var_dump(shell_exec($exec));
    }
}
/**
 * Write the mirror's root packages.json into the build directory.
 *
 * The document carries no inline packages; it points clients at the single
 * provider index p/all$<hash>.json and keeps the notify/search endpoints on
 * packagist.org.
 *
 * @param string $hash   sha256 of the provider index
 * @param object $config reads builddir
 * @return void
 */
function buildjson($hash, $config)
{
    $root = array(
        'packages' => array(),
        'notify' => 'https://packagist.org/downloads/%package%',
        'notify-batch' => 'https://packagist.org/downloads/',
        'providers-url' => '/p/%package%$%hash%.json',
        'search' => 'https://packagist.org/search.json?q=%query%&type=%type%',
        'provider-includes' => array(
            'p/all$%hash%.json' => array('sha256' => $hash),
        ),
        'sync-time' => date(DATE_W3C),
    );

    $target = $config->builddir . 'packages.json';
    file_put_contents($target, json_encode((object) $root));
}
/**
 * Rebuild the mirror's per-package metadata and the aggregated provider index.
 *
 * Steps:
 *  1. Read the cached upstream packages.json and merge the `providers` map of
 *     every provider-include file found in the cache directory.
 *  2. If db/list.json exists, keep only the packages named in its
 *     `packageNames` list.
 *  3. In a worker pool, rewrite every cached package file so that zip dist
 *     URLs point at $config->dlUrl, write the result to
 *     build/p/<name>.json and build/p/<name>$<sha256>.json, store the
 *     reference => upstream URL map in dl/<name>.json and append the new hash
 *     to db/<name>.txt.
 *  4. Send every upstream archive URL that still has to be mirrored, as one
 *     JSON message each, to the TCP service on 127.0.0.1:9501.
 *  5. Write build/p/all$<sha256>.json and append its hash to db/all.txt.
 *
 * Fixes compared with the previous version:
 *  - without db/list.json the provider list was replaced by the decoded
 *    packages.json document; the list is now only replaced when filtering
 *    actually happened;
 *  - the worker reused $key/$value as loop variables, so the history file and
 *    the returned package key could become a dist reference instead of the
 *    package name;
 *  - versions without a `dist` entry no longer raise notices.
 *
 * @param object $config reads cachedir, builddir, dldir, dbdir, dlUrl, maxPool
 * @return string sha256 of the generated p/all$<hash>.json
 * @throws \RuntimeException when packages.json has no provider-includes
 */
function buildPackages($config)
{
    $cachedir = $config->cachedir;
    $upstream = json_decode(file_get_contents($cachedir . 'packages.json'), true);
    if (empty($upstream['provider-includes'])) {
        throw new \RuntimeException('packages.json schema changed?');
    }

    // package name => array('sha256' => ...), merged over all provider files
    $packageslist = array();
    foreach ($upstream['provider-includes'] as $tpl => $version) {
        $fileurl = $cachedir . str_replace('%hash%', $version['sha256'], $tpl);
        $_pack = json_decode(file_get_contents($fileurl), true);
        if (!empty($_pack['providers']) && is_array($_pack['providers'])) {
            $packageslist = array_merge($packageslist, $_pack['providers']);
        }
    }

    // Drop packages that are not in the upstream package list.
    if (file_exists($config->dbdir . 'list.json')) {
        echo "Optimize Packages...." . PHP_EOL;
        $package = json_decode(file_get_contents($config->dbdir . 'list.json'), true);
        // An unreadable list leaves the provider list untouched instead of emptying it.
        if (!empty($package['packageNames']) && is_array($package['packageNames'])) {
            $filtered = array();
            foreach ($package['packageNames'] as $name) {
                if (isset($packageslist[$name])) {
                    $filtered[$name] = $packageslist[$name];
                }
            }
            $packageslist = $filtered;
        }
    }

    echo "build Packagesjson...." . PHP_EOL;
    $progressBar = new ProgressBarManager(0, count($packageslist));
    $progressBar->setFormat('build Packagesjson: %current%/%max% [%bar%] %percent%%');
    $wp = new WorkerPool();
    $wp->setWorkerPoolSize($config->maxPool)->create(new ClosureWorker(
        /**
         * @param mixed $input the input from the WorkerPool::run() Method
         * @param \QXS\WorkerPool\Semaphore $semaphore the semaphore to synchronize calls accross all workers
         * @param \ArrayObject $storage a persistent storage for the current child process
         */
        function ($input, $semaphore, $storage) {
            $config = $input['config'];
            $key = $input['key'];
            $value = $input['value'];
            $builddir = $config->builddir;

            $url = $config->cachedir . 'p/' . $key . '$' . $value['sha256'] . '.json';
            if (!file_exists($url)) {
                return array();
            }
            $array = json_decode(file_get_contents($url));
            if (empty($array->packages)) {
                return array();
            }

            $dl = array();   // dist reference => upstream archive URL
            $keys = array(); // dist reference => mirror object key
            $dls = array();  // mirror object key => upstream archive URL (download queue)
            foreach ($array->packages as $versions) {
                foreach ($versions as $version) {
                    $dist = isset($version->dist) ? $version->dist : null;
                    if (!is_object($dist) || !isset($dist->type) || $dist->type !== 'zip') {
                        continue;
                    }
                    $objectKey = $key . '/' . $dist->reference . '.' . $dist->type;
                    $dl[$dist->reference] = $dist->url;
                    $keys[$dist->reference] = $objectKey;
                    $dls[$objectKey] = $dist->url;
                    // $dist is the same object as $version->dist, so this
                    // rewrites the URL inside the decoded document.
                    $dist->url = $config->dlUrl . $objectKey;
                }
            }

            $json = json_encode($array);
            $hash = hash('sha256', $json);
            $buildfile = $builddir . 'p/' . $key . '.json';
            $buildfile2 = $builddir . 'p/' . $key . '$' . $hash . '.json';
            $dlfile = $config->dldir . $key . '.json';

            if (file_exists($buildfile2)) {
                // Already built with identical content: nothing to write or queue.
                return array('key' => $key, 'hash' => $hash, 'dls' => array());
            }

            if (!is_dir(dirname($buildfile))) {
                @mkdir(dirname($buildfile), 0777, true);
            }
            // Remove hashed builds of earlier versions of this package.
            $oldcache = $builddir . 'p/' . $key . '$*';
            if ($glob = glob($oldcache)) {
                foreach ($glob as $old) {
                    @unlink($old);
                }
            }
            file_put_contents($buildfile, $json);
            file_put_contents($buildfile2, $json);

            if (!is_dir(dirname($dlfile))) {
                @mkdir(dirname($dlfile), 0777, true);
            }
            if (file_exists($dlfile)) {
                // References recorded by an earlier run are not queued again.
                $old_dl = json_decode(file_get_contents($dlfile), true);
                foreach ($keys as $reference => $objectKey) {
                    if (isset($old_dl[$reference])) {
                        unset($dls[$objectKey]);
                    }
                }
            }
            file_put_contents($dlfile, json_encode($dl));

            // Hash history of this package, one hash per line.
            $dbpath = $config->dbdir . $key . '.txt';
            if (!is_dir(dirname($dbpath))) {
                @mkdir(dirname($dbpath), 0777, true);
            }
            file_put_contents($dbpath, $hash . PHP_EOL, FILE_APPEND);

            return array('key' => $key, 'hash' => $hash, 'dls' => $dls);
        }
    ));

    foreach ($packageslist as $key => $value) {
        $wp->run(array('key' => $key, 'value' => $value, 'config' => $config));
        $progressBar->advance();
    }
    $wp->waitForAllWorkers(); // wait for all workers

    $packlkist = array();
    $client = new Swoole\Client(SWOOLE_SOCK_TCP);
    $connected = $client->connect('127.0.0.1', 9501);
    if (!$connected) {
        // Keep building the index, but make the lost queue visible.
        error_log('buildPackages: cannot connect to 127.0.0.1:9501, dist downloads of this run were not queued');
    }
    foreach ($wp as $val) {
        if (empty($val['data'])) {
            continue;
        }
        $packlkist[$val['data']['key']] = $val['data']['hash'];
        if (!$connected) {
            continue;
        }
        foreach ($val['data']['dls'] as $objectKey => $sourceUrl) {
            $client->send(json_encode(array('key' => $objectKey, 'url' => $sourceUrl)));
        }
    }

    ksort($packlkist);
    $packages = array();
    foreach ($packlkist as $key => $val) {
        $packages['providers'][$key]['sha256'] = $val;
    }
    $json = json_encode((object) $packages);
    $hash = hash('sha256', $json);

    $indexDir = $config->builddir . 'p/';
    if (!is_dir($indexDir)) {
        @mkdir($indexDir, 0777, true);
    }
    // Only one aggregated index is kept on disk.
    if ($glob = glob($indexDir . 'all$*')) {
        foreach ($glob as $old) {
            @unlink($old);
        }
    }
    file_put_contents($indexDir . 'all$' . $hash . '.json', $json);
    file_put_contents($config->dbdir . 'all.txt', $hash . PHP_EOL, FILE_APPEND);

    return $hash;
}
/**
 * Download the upstream packages.json, the package name list and every
 * provider-include file that is not yet in the cache.
 *
 * packages.json is stored in the cache directory after its root-relative
 * notify / notify-batch / search endpoints have been prefixed with
 * https://packagist.org. /packages/list.json is stored as db/list.json.
 *
 * @param object $config reads packagistUrl, cachedir and dbdir
 * @return array cache paths of the provider files downloaded by this call;
 *               empty when packages.json did not answer with status 200 or
 *               when every provider file was already cached
 * @throws \RuntimeException when packages.json has no provider-includes
 */
function downloadProviders($config)
{
    $cachedir = $config->cachedir;
    $http = new Client(['base_uri' => $config->packagistUrl, 'timeout' => 60]);

    echo "Downloading Packages..." . PHP_EOL;
    $rootResponse = $http->get('/packages.json');
    if (200 !== $rootResponse->getStatusCode()) {
        return array();
    }
    $packages = json_decode($rootResponse->getBody());
    foreach (array('notify', 'notify-batch', 'search') as $field) {
        if (0 === strpos($packages->$field, '/')) {
            $packages->$field = 'https://packagist.org' . $packages->$field;
        }
    }
    file_put_contents($cachedir . 'packages.json', json_encode($packages));

    echo "Downloading All Packages..." . PHP_EOL;
    $listResponse = $http->get('/packages/list.json');
    if (200 === $listResponse->getStatusCode()) {
        file_put_contents($config->dbdir . 'list.json', $listResponse->getBody());
    }

    if (empty($packages->{'provider-includes'})) {
        throw new \RuntimeException('packages.json schema changed?');
    }
    $includes = $packages->{'provider-includes'};

    $progressBar = new ProgressBarManager(0, count((array) $includes));
    $progressBar->setFormat('Downloading Providers: %current%/%max% [%bar%] %percent%%');

    $providers = [];
    foreach ($includes as $tpl => $version) {
        $relative = str_replace('%hash%', $version->sha256, $tpl);
        $target = $cachedir . $relative;

        if (!file_exists($target)) {
            $providerResponse = $http->get($relative);
            if (200 === $providerResponse->getStatusCode()) {
                // Remove cached copies of this provider stored under older hashes.
                $stalePattern = $cachedir . str_replace('%hash%.json', '*', $tpl);
                foreach (glob($stalePattern) ?: array() as $stale) {
                    @unlink($stale);
                }

                $targetDir = dirname($target);
                if (!file_exists($targetDir)) {
                    mkdir($targetDir, 0777, true);
                }
                file_put_contents($target, $providerResponse->getBody());
                $providers[] = $target;
            }
        }

        $progressBar->advance();
    }

    return $providers;
}
/**
 * Download the per-package metadata files (p/<name>$<sha256>.json) that are
 * listed in the given provider files and are not yet in the cache.
 *
 * Requests run concurrently through a Guzzle pool limited to
 * $config->maxConnections. Before a new file is stored, cached copies of the
 * same package under other hashes are deleted. Failed requests are skipped and
 * only counted; their number is printed once the pool has finished.
 *
 * Changes compared with the previous version: every response used to be
 * JSON-decoded into a variable that was never read, which only cost CPU and
 * memory; that and an unused local were removed, and failures are no longer
 * completely silent.
 *
 * @param object $config    reads cachedir, packagistUrl and maxConnections
 * @param array  $providers cache paths of provider files, as returned by downloadProviders()
 * @return array|null empty array when nothing had to be downloaded, otherwise nothing
 */
function downloadPackages($config, $providers)
{
    $cachedir = $config->cachedir;
    $urls = [];
    foreach ($providers as $providerjson) {
        $list = json_decode(file_get_contents($providerjson));
        if (!$list || empty($list->providers)) {
            continue;
        }
        foreach ($list->providers as $packageName => $provider) {
            $url = "$config->packagistUrl/p/$packageName\$$provider->sha256.json";
            $cachefile = $cachedir . str_replace("$config->packagistUrl/", '', $url);
            if (file_exists($cachefile)) {
                continue;
            }
            $urls[] = $url;
        }
    }

    $numberOfProviders = count($urls);
    if (0 === $numberOfProviders) {
        return array();
    }

    $progressBar = new ProgressBarManager(0, $numberOfProviders);
    $progressBar->setFormat('Downloading Package: %current%/%max% [%bar%] %percent%%');

    $client = new Client();
    $requests = function ($urls) use ($client) {
        foreach ($urls as $uri) {
            yield function () use ($client, $uri) {
                return $client->getAsync($uri);
            };
        }
    };

    $failed = 0;
    $pool = new Pool($client, $requests($urls), [
        'concurrency' => $config->maxConnections,
        'fulfilled' => function ($response, $index) use ($progressBar, $urls, $config, $cachedir) {
            $cachefile = $cachedir . str_replace("$config->packagistUrl/", '', $urls[$index]);
            $packageName = explode('$', str_replace("$config->packagistUrl/p/", '', $urls[$index]))[0];
            // Drop cached copies of this package stored under older hashes.
            if ($glob = glob("{$cachedir}p/$packageName\$*")) {
                foreach ($glob as $old) {
                    @unlink($old);
                }
            }
            if (!file_exists(dirname($cachefile))) {
                mkdir(dirname($cachefile), 0777, true);
            }
            file_put_contents($cachefile, $response->getBody());
            $progressBar->advance();
        },
        'rejected' => function ($reason, $index) use ($progressBar, &$failed) {
            // Best effort: the file is still missing from the cache afterwards,
            // so a later run that lists it will try again.
            $failed++;
            $progressBar->advance();
        },
    ]);

    // Start sending the requests and block until all of them have settled.
    $promise = $pool->promise();
    $promise->wait();

    if ($failed > 0) {
        echo PHP_EOL . "Failed package downloads: $failed" . PHP_EOL;
    }
}
Dl_Server.php
<?php
use Qiniu\Auth;
use Qiniu\Storage\UploadManager;
// Dl_Server.php bootstrap: timezone for the dated log file name, memory limit,
// Composer autoloader, then the configuration.
date_default_timezone_set("Asia/Shanghai");
ini_set('memory_limit', '1G');
require_once __DIR__ . '/vendor/autoload.php';
// A local config.php takes precedence over the shipped config.default.php.
if (file_exists(__DIR__ . '/config.php')) {
$config = require __DIR__ . '/config.php';
} else {
$config = require __DIR__ . '/config.default.php';
}
/**
 * Swoole TCP service on 127.0.0.1:9501 that mirrors package archives.
 *
 * Every received message is handed to a task worker. A task expects a JSON
 * object {"key": <object key>, "url": <source URL>}, downloads the URL with
 * curl and uploads the body to the Qiniu bucket "dl-composer" under that key.
 *
 * Fixes compared with the previous version:
 *  - HTTP error responses (status >= 400) are treated as failures instead of
 *    uploading the error page as if it were the archive;
 *  - a failed curl_exec() without an error message is no longer uploaded;
 *  - payloads that do not decode to the expected object are rejected without
 *    indexing into null;
 *  - the log file name is derived from the current date on every write, so a
 *    long-running daemon rolls over to a new file each day.
 *
 * NOTE(review): messages are received as raw TCP data without any framing, and
 * crawl.php sends its JSON messages back to back. Several messages arriving in
 * one onReceive() call would not decode and would be answered with
 * 'url is null'. Confirm whether a packet delimiter is needed on both sides.
 */
class Server
{
    private $serv;
    private $config;

    function __construct($config)
    {
        $this->config = $config;
        $this->serv = new Swoole\Server("127.0.0.1", 9501);
        $this->serv->set([
            'worker_num' => 2, // usually 1-4 times the number of CPU cores
            'task_worker_num' => 10, // number of task processes (tasks here are synchronous and blocking)
            'daemonize' => true, // run as a daemon
            // NOTE(review): the original comment called this "unix socket, the
            // default mode"; confirm what value 3 selects in the Swoole docs.
            'task_ipc_mode' => 3,
            'log_file' => __DIR__.'/logs/queue.log', // Swoole's own log
            // NOTE(review): the original comment described value 3 as "fixed mode,
            // dispatch by connection fd so one connection always reaches the same
            // worker", and in the same breath advised against modes 1 and 3 for
            // servers that are not request/response style. Confirm the intended mode.
            'dispatch_mode' => 3,
        ]);
        $this->serv->on('receive', array($this, 'onReceive'));
        $this->serv->on('task', array($this, 'onTask'));
        $this->serv->on('finish', array($this, 'onFinish'));
        $this->serv->start();
    }

    /**
     * Append a message to logs/sync-<today>.txt.
     *
     * @param string $message
     * @return void
     */
    private function writeLog($message)
    {
        error_log($message, 3, __DIR__.'/logs/sync-'.date('Y-m-d').'.txt');
    }

    /**
     * Log the raw message and queue it as a task.
     */
    public function onReceive($serv, $fd, $from_id, $data)
    {
        $str = PHP_EOL."=========== onReceive ============".PHP_EOL;
        $str .= "Get Message From Client {$fd}:{$data}".PHP_EOL;
        $this->writeLog($str);
        $serv->task($data);
    }

    /**
     * Download one archive and upload it to the bucket.
     *
     * @param string $data JSON object with the keys "key" and "url"
     * @return string 'Success', 'url is null' for an unusable payload, the curl
     *                error text, or the exported upload error
     */
    public function onTask($serv, $task_id, $from_id, $data)
    {
        $array = json_decode($data, true);
        if (
            !is_array($array)
            || !isset($array['key'], $array['url'])
            || !is_string($array['key']) || $array['key'] === ''
            || !is_string($array['url']) || $array['url'] === ''
        ) {
            return 'url is null';
        }

        $curl_handle = curl_init();
        curl_setopt($curl_handle, CURLOPT_URL, $array['url']);
        // 0 disables the overall transfer timeout, as before.
        curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 0);
        curl_setopt($curl_handle, CURLOPT_TIMEOUT, 0);
        // NOTE(review): certificate verification is switched off, so downloaded
        // archives are not protected against tampering in transit. Kept as is
        // to avoid breaking hosts without a CA bundle; consider enabling it.
        curl_setopt($curl_handle, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Composer-proxy Download Engine v1.0 https://www.composer-proxy.org');
        curl_setopt($curl_handle, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, TRUE);
        // Turn HTTP status >= 400 into a curl error instead of returning the error page.
        curl_setopt($curl_handle, CURLOPT_FAILONERROR, TRUE);
        $Response = curl_exec($curl_handle);
        $error = curl_error($curl_handle);
        if ($Response === false || $error) {
            if (!$error) {
                $error = 'curl_exec failed';
            }
            $this->writeLog('Curl error:'.$error.' ('.$array['url'].')'.PHP_EOL);
            return $error;
        }

        $bucket = "dl-composer";
        $auth = new Auth($this->config->accessKey, $this->config->secretKey);
        $token = $auth->uploadToken($bucket);
        $uploadMgr = new UploadManager();
        list($ret, $err) = $uploadMgr->put($token, $array['key'], $Response);

        $str = "=========== onTask ============".PHP_EOL;
        $str .= var_export($array, 1).PHP_EOL;
        $str .= "=====> fetch ".$array['url']." to bucket: $bucket key: ".$array['key'].PHP_EOL;
        $this->writeLog($str);

        if ($err !== null) {
            return var_export($err, 1).PHP_EOL;
        }
        return 'Success';
    }

    /**
     * Log the value returned by onTask().
     */
    public function onFinish($serv, $task_id, $data)
    {
        $str = "=========== onFinish ============".PHP_EOL;
        $str .= "Task {$task_id} finish !".PHP_EOL;
        $str .= $data;
        $this->writeLog($str);
    }
}
// The constructor configures the Swoole server and calls start() itself, so
// creating the object is what launches the service.
new Server($config);
config.default.php
<?php
// Default configuration, returned as an object. The scripts load config.php
// instead when that file exists. The directory values end with a slash because
// the scripts append file names to them directly.
return (object)array(
// Cache of files downloaded from upstream (packages.json, provider files, p/<name>$<hash>.json).
'cachedir' => __DIR__ . '/cache/',
// Output directory of the rewritten mirror metadata (packages.json and p/...).
'builddir' => __DIR__ . '/build/',
// One JSON file per package mapping dist reference => upstream archive URL.
'dldir' => __DIR__ . '/dl/',
// State files: list.json, the per-package and all.txt hash histories, 404.json.
'dbdir' => __DIR__ . '/db/',
// Prefix that replaces the upstream URL of zip dists in the rewritten metadata.
'dlUrl' => 'https://dl.composer-proxy.org/',
// Upstream repository base URL, without a trailing slash.
'packagistUrl' => 'https://packagist.org',
// Concurrency of the Guzzle request pool in downloadPackages().
'maxConnections' => 200,
// Worker pool size used by buildPackages().
'maxPool' => 15,
// Qiniu credentials used for CDN calls, uploads and bucket cleanup.
'accessKey' => '--',
'secretKey' => '',
// NOTE(review): not read by any of the scripts shown here; the bucket names are hard-coded there.
'bucket' => ''
);
clean_qiniu_old.php
<?php
use Qiniu\Auth;
use Qiniu\Storage\BucketManager;
// clean_qiniu_old.php: delete outdated hashed metadata files from the Qiniu
// bucket "packagist".
//
// db/<vendor>/<package>.txt and db/all.txt hold one sha256 per line, oldest
// first. For each history the two newest hashes are kept, every older one is
// turned into the object key p/<name>$<hash>.json and deleted in batches of 500.
//
// Fixes compared with the previous version:
//  - the package key was built by stripping ".json" from a ".txt" path, so the
//    generated keys contained ".txt" and never matched an object;
//  - each rewrite appended an extra blank line which counted as a history
//    entry on the next run and caused a still-current hash to be deleted;
//  - a missing all.txt or a failing glob() no longer raises errors;
//  - a hash that reappears among the kept entries is never deleted.
set_time_limit(3600);
require_once __DIR__ . '/vendor/autoload.php';
ini_set('memory_limit', '1G');
if (file_exists(__DIR__ . '/config.php')) {
    $config = require __DIR__ . '/config.php';
} else {
    $config = require __DIR__ . '/config.default.php';
}

/**
 * Trim a hash-history file to its newest entries.
 *
 * @param string $file path of the history file (one hash per line, oldest first)
 * @param int    $keep number of newest hashes to keep
 * @return array hashes that were removed from the file and are not among the kept ones
 */
function pruneHashHistory($file, $keep = 2)
{
    if (!is_file($file)) {
        return array();
    }
    $lines = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        return array();
    }
    $hashes = array_values(array_filter(array_map('trim', $lines), 'strlen'));
    $excess = count($hashes) - $keep;
    if ($excess <= 0) {
        return array();
    }
    $stale = array_slice($hashes, 0, $excess);
    $kept = array_slice($hashes, $excess);
    file_put_contents($file, implode(PHP_EOL, $kept) . PHP_EOL);

    return array_values(array_unique(array_diff($stale, $kept)));
}

$secretKey = $config->secretKey;
$accessKey = $config->accessKey;
$auth = new Auth($accessKey, $secretKey);
$bucketMgr = new BucketManager($auth);

$filename = array();
$path = $config->dbdir;
foreach (glob($path . '*/*.txt') ?: array() as $v) {
    // "<dbdir><vendor>/<package>.txt" => "<vendor>/<package>"
    $key = substr($v, strlen($path), -strlen('.txt'));
    foreach (pruneHashHistory($v) as $staleHash) {
        $filename[] = 'p/' . $key . '$' . $staleHash . '.json';
    }
}
foreach (pruneHashHistory($path . 'all.txt') as $staleHash) {
    $filename[] = 'p/all$' . $staleHash . '.json';
}

$bucket = "packagist";
foreach (array_chunk($filename, 500) as $value) {
    $files = $bucketMgr->buildBatchDelete($bucket, $value);
    var_dump($bucketMgr->batch($files));
}
check_404.php
<?php
use QXS\WorkerPool\ClosureWorker;
use QXS\WorkerPool\WorkerPool;
use ProgressBar\Manager as ProgressBarManager;
// check_404.php bootstrap: two-hour time limit, memory limit, Composer
// autoloader, then the configuration.
date_default_timezone_set("Asia/Shanghai");
set_time_limit(7200);
ini_set('memory_limit', '1G');
require_once __DIR__ . '/vendor/autoload.php';
// A local config.php takes precedence over the shipped config.default.php.
if (file_exists(__DIR__ . '/config.php')) {
$config = require __DIR__ . '/config.php';
} else {
$config = require __DIR__ . '/config.default.php';
}
/**
 * Turn a VCS source URL into a URL that can be probed over HTTP(S).
 *
 * A literal ".git" suffix is removed, "git@host:path" becomes
 * "https://host/path" and "git://host/path" becomes "https://host/path".
 * Any other URL is returned unchanged.
 *
 * @param string $url
 * @return string
 */
function normalizeSourceUrl($url)
{
    $url = trim($url);
    // rtrim($url, '.git') would strip the characters ".", "g", "i", "t" from
    // the end (".../config.git" => ".../conf"), so cut the exact suffix instead.
    if (substr($url, -4) === '.git') {
        $url = substr($url, 0, -4);
    }
    if (preg_match('#^git@([^:/]+)[:/](.+)$#', $url, $m)) {
        return 'https://' . $m[1] . '/' . $m[2];
    }
    if (strpos($url, 'git://') === 0) {
        return 'https://' . substr($url, strlen('git://'));
    }
    return $url;
}

/**
 * Find packages whose source repository no longer answers with HTTP 200 and
 * write their names to db/404.json.
 *
 * Every cached p/<vendor>/<package>$<hash>.json is inspected. Files without a
 * `packages` section are reported directly. For the others the source URL of
 * the first listed version is probed in a pool of 20 workers; a status other
 * than 200 reports the package. Entries without a name or source URL cannot be
 * probed and are skipped (they used to be probed with an empty URL and were
 * therefore always reported).
 *
 * @param object $config reads cachedir and dbdir
 * @return void
 */
function check_404($config)
{
    $path = $config->cachedir;
    $packagist = array(); // package name => URL to probe
    $pack_404 = array();
    foreach (glob($path . 'p/*/*.json') ?: array() as $v) {
        $arr = json_decode(file_get_contents($v), true);
        if (!is_array($arr) || empty($arr['packages'])) {
            $pack_404[] = explode('$', str_replace($config->cachedir . 'p/', '', $v))[0];
            continue;
        }
        // First version of the first package in the file.
        $versions = reset($arr['packages']);
        $array = is_array($versions) ? reset($versions) : false;
        if (!is_array($array) || !isset($array['name'], $array['source']['url'])) {
            continue;
        }
        $packagist[$array['name']] = normalizeSourceUrl($array['source']['url']);
    }

    $progressBar = new ProgressBarManager(0, count($packagist));
    $progressBar->setFormat('Check 404 Project: %current%/%max% [%bar%] %percent%%');
    $wp = new WorkerPool();
    $wp->setWorkerPoolSize(20)->create(new ClosureWorker(
        /**
         * @param mixed $input the input from the WorkerPool::run() Method
         * @param \QXS\WorkerPool\Semaphore $semaphore the semaphore to synchronize calls accross all workers
         * @param \ArrayObject $storage a persistent storage for the current child process
         */
        function ($input, $semaphore, $storage) {
            if (200 != GetHttpStatusCode($input['url'])) {
                return $input['key'];
            }
            return false;
        }
    ));

    foreach ($packagist as $key => $value) {
        $wp->run(array('key' => $key, 'url' => $value));
        $progressBar->advance();
    }
    $wp->waitForAllWorkers();

    foreach ($wp as $val) {
        if ($val['data']) {
            $pack_404[] = $val['data'];
        }
    }
    file_put_contents($config->dbdir . '404.json', json_encode($pack_404), LOCK_EX);
}
/**
 * Probe a URL with a body-less request and return the final HTTP status code.
 *
 * Redirects are followed, the transfer is limited to 30 seconds and
 * certificate checks are disabled. When no HTTP response was received,
 * curl reports the status as 0.
 *
 * @param string $url
 * @return int
 */
function GetHttpStatusCode($url)
{
    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL => $url,
        CURLOPT_USERAGENT => 'Composer-proxy Check Engine v1.0 https://www.composer-proxy.org',
        CURLOPT_HEADER => 1,          // include the response headers
        CURLOPT_SSL_VERIFYPEER => FALSE,
        CURLOPT_SSL_VERIFYHOST => FALSE,
        CURLOPT_NOBODY => 1,          // do not fetch the body
        CURLOPT_RETURNTRANSFER => 1,  // return the data instead of printing it
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_TIMEOUT => 30,        // seconds
    ));
    curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);

    return $status;
}
// Run the check once with the loaded configuration, then end the script.
check_404($config);
exit;
哥哥呀 你给这些代码放github上吧 :joy: