220 lines
7.6 KiB
PHP
220 lines
7.6 KiB
PHP
<?php
|
|
namespace Uiharu\Lookup;
|
|
|
|
use stdClass;
|
|
use DOMDocument;
|
|
use RuntimeException;
|
|
use Uiharu\Config;
|
|
use Uiharu\FFMPEG;
|
|
use Uiharu\MediaTypeExts;
|
|
use Uiharu\Url;
|
|
use Index\MediaType;
|
|
|
|
// TODO: Content-Disposition should be honoured for the filename (title).
|
|
/**
 * Lookup handler for web URLs (anything for which Url::isWeb() is true).
 *
 * A header-only request is issued first to discover the media type. HTML and
 * XML-ish documents are then downloaded and scraped for metadata, media types
 * are handed to FFMPEG for probing, and everything else gets a fallback result.
 */
final class WebLookup implements \Uiharu\ILookup {
    /**
     * Tells whether this handler is applicable to the given URL.
     */
    public function match(Url $url): bool {
        return $url->isWeb();
    }

    /**
     * Creates a cURL handle configured for short, HTTP(S)-only requests.
     *
     * @param string $url Target URL. lookup() passes a Url object here, so this
     *                    relies on non-strict string coercion of that object.
     * @return resource|\CurlHandle Configured handle, to be released with reqClose().
     */
    private static function reqCreate(string $url) {
        $curl = curl_init($url);

        curl_setopt_array($curl, [
            CURLOPT_AUTOREFERER => true,
            CURLOPT_CERTINFO => false,
            CURLOPT_FAILONERROR => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_PATH_AS_IS => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TCP_FASTOPEN => true,
            CURLOPT_CONNECTTIMEOUT => 2,
            // Only plain web protocols, both initially and after redirects.
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_TIMEOUT => 5,
            CURLOPT_DEFAULT_PROTOCOL => 'https',
            CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION,
            CURLOPT_HTTPHEADER => [
                'Accept: text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8',
            ],
        ]);

        return $curl;
    }

    /**
     * Performs a header-only request and parses the response headers.
     *
     * @param resource|\CurlHandle $curl Handle created by reqCreate().
     * @return array{status: int, lines: array<string, string>}|null
     *         Status code (200 when no status line could be parsed) and the
     *         headers keyed by lower-cased name, or null if the transfer failed.
     */
    private static function reqHead($curl): array|null {
        curl_setopt_array($curl, [
            CURLOPT_NOBODY => true,
            CURLOPT_HEADER => true,
        ]);

        $raw = curl_exec($curl);
        if($raw === false)
            return null;

        $status = 200;
        $lines = [];

        foreach(explode("\r\n", trim($raw)) as $header) {
            if(empty($header))
                continue;

            // A line without a colon is treated as a status line. The collected
            // headers are reset there, so when several header blocks are present
            // (e.g. after followed redirects) only the last block is kept.
            if(strpos($header, ':') === false) {
                $statusParts = explode(' ', $header);
                if(isset($statusParts[1]) && is_numeric($statusParts[1]))
                    $status = (int)$statusParts[1];
                $lines = [];
                continue;
            }

            $pieces = explode(':', $header, 2);
            $name = mb_strtolower($pieces[0]);
            $value = trim($pieces[1] ?? '');

            // Repeated headers are folded into one comma separated value.
            $lines[$name] = isset($lines[$name]) ? $lines[$name] . ', ' . $value : $value;
        }

        return ['status' => $status, 'lines' => $lines];
    }

    /**
     * Downloads the response body using a handle previously used by reqHead().
     *
     * @param resource|\CurlHandle $curl Handle created by reqCreate().
     * @return string|false Body text, or false if the transfer failed.
     */
    private static function reqBody($curl): string|false {
        curl_setopt_array($curl, [
            // Explicitly switch the reused handle back from HEAD to GET rather
            // than relying on NOBODY => false to reset the request method.
            CURLOPT_HTTPGET => true,
            CURLOPT_NOBODY => false,
            CURLOPT_HEADER => false,
        ]);

        return curl_exec($curl);
    }

    /**
     * Returns the last error message recorded on the handle.
     *
     * @param resource|\CurlHandle $curl
     */
    private static function reqError($curl): string {
        return curl_error($curl);
    }

    /**
     * Releases the handle.
     *
     * @param resource|\CurlHandle $curl
     */
    private static function reqClose($curl): void {
        curl_close($curl);
    }

    /**
     * Reverts text that was UTF-8 encoded twice.
     *
     * The text is decoded from UTF-8 to ISO-8859-1; if the outcome is itself
     * valid, non-ASCII UTF-8 the input is assumed to have been double encoded
     * and the decoded form is returned. Otherwise the input is returned as-is.
     */
    private static function repairDoubleUtf8(string $text): string {
        // Replacement for utf8_decode(), which is deprecated as of PHP 8.2.
        $decoded = mb_convert_encoding($text, 'ISO-8859-1', 'UTF-8');
        if(!is_string($decoded))
            return $text;

        // Strict mode so invalid byte sequences are never reported as a
        // "closest match"; ASCII is listed first so plain ASCII is left alone.
        return mb_detect_encoding($decoded, ['ASCII', 'UTF-8'], true) === 'UTF-8' ? $decoded : $text;
    }

    /**
     * Looks up information about a web URL.
     *
     * @throws RuntimeException If the header request fails, or if the page
     *                          body cannot be downloaded for a HTML/XML document.
     */
    public function lookup(Url $url): WebLookupResult {
        $req = self::reqCreate($url);
        $head = self::reqHead($req);

        if($head === null) {
            // The error has to be read before the handle is released.
            $error = self::reqError($req);
            self::reqClose($req);
            throw new RuntimeException('Web request timed out: ' . $error);
        }

        try {
            $mediaType = MediaType::parse($head['lines']['content-type'] ?? '');
        } catch(\InvalidArgumentException $ex) {
            // Fully qualified on purpose: an unqualified name would resolve
            // inside this namespace and the fallback would never trigger.
            $mediaType = MediaType::parse('application/octet-stream');
        }

        $isXHTML = $mediaType->equals('application/xhtml+xml') || $mediaType->equals('application/xml');
        if($isXHTML || $mediaType->equals('text/html'))
            return $this->lookupSite($url, $req, $mediaType, $isXHTML); // closes $req itself

        self::reqClose($req);

        if(MediaTypeExts::isMedia($mediaType))
            return $this->lookupMedia($url, $mediaType);

        return new WebLookupFallbackResult($url, $mediaType, $url->getHost() . ': ' . basename($url->getPath()));
    }

    /**
     * Downloads a HTML/XML document and extracts title and meta tag details.
     *
     * @param resource|\CurlHandle $req Handle already used for the header
     *                                  request; it is closed by this method.
     * @param bool $isXHTML Whether to parse the body as XML instead of HTML.
     * @throws RuntimeException If the body transfer fails.
     */
    private function lookupSite(Url $url, $req, MediaType $mediaType, bool $isXHTML): WebLookupResult {
        $body = self::reqBody($req);
        $error = $body === false ? self::reqError($req) : '';
        self::reqClose($req);

        if($body === false)
            throw new RuntimeException('Failed to download page body: ' . $error);

        $charSet = $mediaType->getCharset();
        $isUtf8 = strtolower($charSet) === 'utf-8';

        $document = new DOMDocument;

        // DOMDocument::loadXML()/loadHTML() reject an empty source, so an empty
        // body is left unparsed and yields a result with default values.
        if($body !== '') {
            if($isXHTML) {
                $document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
            } else {
                $document->encoding = $charSet;
                $document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);

                // Drop the first top level processing instruction, if any.
                foreach($document->childNodes as $child)
                    if($child->nodeType === XML_PI_NODE) {
                        $document->removeChild($child);
                        break;
                    }
            }
        }

        $siteInfo = new stdClass;
        $siteInfo->title = '';
        $siteInfo->metaTitle = '';
        $siteInfo->desc = '';
        $siteInfo->siteName = '';
        $siteInfo->image = '';
        $siteInfo->colour = '';
        $siteInfo->type = 'website';

        // Only the first <title> element is considered.
        foreach($document->getElementsByTagName('title') as $tag) {
            $content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
            if($isUtf8)
                $content = self::repairDoubleUtf8($content);
            $siteInfo->title = $content;
            break;
        }

        foreach($document->getElementsByTagName('meta') as $tag) {
            // The key is taken from "name" or else "property", the value from
            // "value" or else "content".
            $nameAttr = $tag->hasAttribute('name') ? $tag->getAttribute('name') : (
                $tag->hasAttribute('property') ? $tag->getAttribute('property') : ''
            );
            $valueAttr = $tag->hasAttribute('value') ? $tag->getAttribute('value') : (
                $tag->hasAttribute('content') ? $tag->getAttribute('content') : ''
            );

            $nameAttr = trim(mb_convert_encoding($nameAttr, 'utf-8', $charSet));
            $valueAttr = trim(mb_convert_encoding($valueAttr, 'utf-8', $charSet));
            if(empty($nameAttr) || empty($valueAttr))
                continue;

            if($isUtf8) {
                $nameAttr = self::repairDoubleUtf8($nameAttr);
                $valueAttr = self::repairDoubleUtf8($valueAttr);
            }

            switch($nameAttr) {
                case 'og:title':
                case 'twitter:title':
                    // The longest candidate wins.
                    if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle))
                        $siteInfo->metaTitle = $valueAttr;
                    break;

                case 'description':
                case 'og:description':
                case 'twitter:description':
                    // The longest candidate wins.
                    if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc))
                        $siteInfo->desc = $valueAttr;
                    break;

                case 'og:site_name':
                    // First occurrence wins.
                    if(empty($siteInfo->siteName))
                        $siteInfo->siteName = $valueAttr;
                    break;

                case 'og:image':
                case 'twitter:image':
                    // First occurrence wins.
                    if(empty($siteInfo->image))
                        $siteInfo->image = $valueAttr;
                    break;

                case 'theme-color':
                    // Last occurrence wins.
                    $siteInfo->colour = $valueAttr;
                    break;

                case 'og:type':
                    $siteInfo->type = 'website:' . $valueAttr;
                    break;
            }
        }

        return new WebLookupSiteResult($url, $mediaType, $siteInfo);
    }

    /**
     * Builds a media result from the output of FFMPEG::cleanProbe() for the URL.
     */
    private function lookupMedia(Url $url, MediaType $mediaType): WebLookupResult {
        $mediaInfo = FFMPEG::cleanProbe($url);
        return new WebLookupMediaResult($url, $mediaType, $mediaInfo);
    }
}
|