uiharu/src/Lookup/WebLookup.php

210 lines
6.8 KiB
PHP

<?php
namespace Uiharu\Lookup;
use stdClass;
use DOMDocument;
use RuntimeException;
use Uiharu\Config;
use Uiharu\FFMPEG;
use Uiharu\MediaTypeExts;
use Uiharu\Url;
use Index\MediaType;
// TODO: Content-Disposition should be honoured for the filename (title).
/**
 * Looks up metadata for URLs that Url::isWeb() accepts.
 *
 * A header-only request is issued first to learn the content type. HTML and
 * XHTML documents are then downloaded and scanned for a title and common
 * meta tags, media types are handed to FFMPEG::cleanProbe(), and everything
 * else yields a fallback result built from the host and the path's base name.
 */
final class WebLookup implements \Uiharu\ILookup {
    /**
     * Tells whether this lookup handles the given URL.
     *
     * @param Url $url URL to test.
     * @return bool Whatever Url::isWeb() reports for the URL.
     */
    public function match(Url $url): bool {
        return $url->isWeb();
    }

    /**
     * Creates a cURL handle preconfigured for requesting the given URL.
     *
     * Only HTTP and HTTPS are allowed (also across redirects), at most five
     * redirects are followed, and the connect/total timeouts are 2 and 5 seconds.
     *
     * @param string $url URL to request.
     * @return \CurlHandle Configured handle; the caller is responsible for closing it.
     * @throws RuntimeException If the handle could not be initialised.
     */
    private static function reqCreate(string $url): \CurlHandle {
        $curl = curl_init($url);
        if($curl === false)
            throw new RuntimeException('Failed to initialise web request.');

        curl_setopt_array($curl, [
            CURLOPT_AUTOREFERER => true,
            CURLOPT_CERTINFO => false,
            CURLOPT_FAILONERROR => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_PATH_AS_IS => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TCP_FASTOPEN => true,
            CURLOPT_CONNECTTIMEOUT => 2,
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_TIMEOUT => 5,
            CURLOPT_DEFAULT_PROTOCOL => 'https',
            CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION,
            CURLOPT_HTTPHEADER => [
                'Accept: text/html,application/xhtml+xml',
            ],
        ]);

        return $curl;
    }

    /**
     * Performs a header-only request and parses the response headers.
     *
     * When redirects are followed the raw output contains one header block per
     * hop; every status line resets the collected headers so only the final
     * response is reported. Repeated headers are joined with ", ".
     *
     * @param \CurlHandle $curl Handle created by reqCreate().
     * @return array{status: int, lines: array<string, string>}|null
     *         Status code and lower-cased header map, or null if the transfer failed.
     */
    private static function reqHead(\CurlHandle $curl): array|null {
        curl_setopt_array($curl, [
            CURLOPT_NOBODY => true,
            CURLOPT_HEADER => true,
        ]);

        $raw = curl_exec($curl);
        if($raw === false)
            return null;

        $status = 200;
        $lines = [];

        foreach(explode("\r\n", trim($raw)) as $header) {
            if($header === '')
                continue;

            // A line without a colon is treated as a status line, which
            // starts the header block of a new response.
            if(!str_contains($header, ':')) {
                $headParts = explode(' ', $header);
                if(isset($headParts[1]) && is_numeric($headParts[1]))
                    $status = (int)$headParts[1];
                $lines = [];
                continue;
            }

            [$name, $value] = explode(':', $header, 2);
            $name = mb_strtolower($name);
            $value = trim($value);

            if(isset($lines[$name]))
                $lines[$name] .= ', ' . $value;
            else
                $lines[$name] = $value;
        }

        return ['status' => $status, 'lines' => $lines];
    }

    /**
     * Performs a regular GET request on the handle and returns the body.
     *
     * @param \CurlHandle $curl Handle created by reqCreate().
     * @return string|false Response body, or false if the transfer failed.
     */
    private static function reqBody(\CurlHandle $curl): string|false {
        curl_setopt_array($curl, [
            CURLOPT_NOBODY => false,
            CURLOPT_HEADER => false,
            // reqHead() turned the request into a HEAD request; explicitly
            // switch the method back to GET instead of relying on NOBODY alone.
            CURLOPT_HTTPGET => true,
        ]);

        return curl_exec($curl);
    }

    /**
     * Returns the last error message of the handle.
     *
     * @param \CurlHandle $curl Handle to inspect.
     * @return string Error text as reported by curl_error().
     */
    private static function reqError(\CurlHandle $curl): string {
        return curl_error($curl);
    }

    /**
     * Closes the handle.
     *
     * @param \CurlHandle $curl Handle to close.
     */
    private static function reqClose(\CurlHandle $curl): void {
        curl_close($curl);
    }

    /**
     * Converts text from the given character set to UTF-8 and trims it.
     *
     * The character set label ultimately comes from the remote server or
     * document, so a label mbstring does not recognise must not abort the
     * lookup; in that case the text is returned unconverted.
     *
     * @param string $text Text to convert.
     * @param string|null $charSet Source character set; null lets mbstring use its default.
     * @return string Trimmed text.
     */
    private static function toUtf8(string $text, ?string $charSet): string {
        try {
            return trim(mb_convert_encoding($text, 'utf-8', $charSet));
        } catch(\ValueError $ex) {
            return trim($text);
        }
    }

    /**
     * Looks up information about the resource behind the given URL.
     *
     * @param Url $url URL to look up.
     * @return WebLookupResult Site, media or fallback result depending on the content type.
     * @throws RuntimeException If the request could not be created or the transfer failed.
     */
    public function lookup(Url $url): WebLookupResult {
        $req = self::reqCreate((string)$url);

        $head = self::reqHead($req);
        if($head === null) {
            // The error text has to be read before the handle is released.
            $error = self::reqError($req);
            self::reqClose($req);
            throw new RuntimeException('Web request timed out: ' . $error);
        }

        try {
            $mediaType = MediaType::parse($head['lines']['content-type'] ?? '');
        } catch(\InvalidArgumentException $ex) {
            // Fully qualified on purpose: an unqualified name would resolve
            // inside this namespace and the catch would never match.
            $mediaType = MediaType::parse('application/octet-stream');
        }

        $isXHTML = $mediaType->equals('application/xhtml+xml');
        if($isXHTML || $mediaType->equals('text/html'))
            return $this->lookupSite($url, $req, $mediaType, $isXHTML);

        self::reqClose($req);

        if(MediaTypeExts::isMedia($mediaType))
            return $this->lookupMedia($url, $mediaType);

        return new WebLookupFallbackResult($url, $mediaType, $url->getHost() . ': ' . basename($url->getPath()));
    }

    /**
     * Downloads an (X)HTML document and extracts its title and meta information.
     *
     * The handle is closed by this method. The last og:/twitter: title, site
     * name, image, theme colour and type found win, while the first
     * description found is kept.
     *
     * @param Url $url URL that is being looked up.
     * @param \CurlHandle $req Handle on which the header request was already performed.
     * @param MediaType $mediaType Content type reported by the server.
     * @param bool $isXHTML True to parse the body as XML rather than HTML.
     * @return WebLookupResult Site result carrying the collected information.
     * @throws RuntimeException If downloading the body failed.
     */
    private function lookupSite(Url $url, \CurlHandle $req, MediaType $mediaType, bool $isXHTML): WebLookupResult {
        $body = self::reqBody($req);
        // The error text has to be read before the handle is released.
        $error = $body === false ? self::reqError($req) : '';
        self::reqClose($req);

        if($body === false)
            throw new RuntimeException('Web request failed: ' . $error);

        $document = new DOMDocument;
        $parseFlags = LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING;

        // loadXML() and loadHTML() throw a ValueError for an empty string, so
        // an empty response body is simply left as an empty document.
        if($body !== '') {
            if($isXHTML) {
                $document->loadXML($body, $parseFlags);
            } else {
                $document->loadHTML($body, $parseFlags);

                // Drop the first processing instruction node, if any.
                foreach($document->childNodes as $child)
                    if($child->nodeType === XML_PI_NODE) {
                        $document->removeChild($child);
                        break;
                    }

                try {
                    $document->encoding = $mediaType->getCharset();
                } catch(\ValueError $ex) {
                    // An unrecognised charset label may be rejected by newer
                    // PHP versions; keep the encoding the parser settled on.
                }
            }
        }

        $charSet = $document->encoding;

        $siteInfo = new stdClass;
        $siteInfo->title = '';
        $siteInfo->metaTitle = '';
        $siteInfo->desc = '';
        $siteInfo->siteName = '';
        $siteInfo->image = '';
        $siteInfo->colour = '';
        $siteInfo->type = 'website';

        // Only the first <title> element is considered.
        $titleTag = $document->getElementsByTagName('title')->item(0);
        if($titleTag !== null)
            $siteInfo->title = self::toUtf8($titleTag->textContent, $charSet);

        foreach($document->getElementsByTagName('meta') as $tag) {
            // The key is taken from "name", falling back to "property";
            // the value from "value", falling back to "content".
            $nameAttr = $tag->hasAttribute('name') ? $tag->getAttribute('name') : (
                $tag->hasAttribute('property') ? $tag->getAttribute('property') : ''
            );
            $valueAttr = $tag->hasAttribute('value') ? $tag->getAttribute('value') : (
                $tag->hasAttribute('content') ? $tag->getAttribute('content') : ''
            );

            $nameAttr = self::toUtf8($nameAttr, $charSet);
            $valueAttr = self::toUtf8($valueAttr, $charSet);

            // Compared against '' rather than empty() so a literal "0" is kept.
            if($nameAttr === '' || $valueAttr === '')
                continue;

            switch($nameAttr) {
                case 'og:title':
                case 'twitter:title':
                    $siteInfo->metaTitle = $valueAttr;
                    break;

                case 'description':
                case 'og:description':
                case 'twitter:description':
                    if($siteInfo->desc === '')
                        $siteInfo->desc = $valueAttr;
                    break;

                case 'og:site_name':
                    $siteInfo->siteName = $valueAttr;
                    break;

                case 'og:image':
                case 'twitter:image':
                    $siteInfo->image = $valueAttr;
                    break;

                case 'theme-color':
                    $siteInfo->colour = $valueAttr;
                    break;

                case 'og:type':
                    $siteInfo->type = 'website:' . $valueAttr;
                    break;
            }
        }

        return new WebLookupSiteResult($url, $mediaType, $siteInfo);
    }

    /**
     * Probes a media resource through FFMPEG::cleanProbe().
     *
     * @param Url $url URL of the media resource.
     * @param MediaType $mediaType Content type reported by the server.
     * @return WebLookupResult Media result carrying the probe output.
     */
    private function lookupMedia(Url $url, MediaType $mediaType): WebLookupResult {
        $mediaInfo = FFMPEG::cleanProbe($url);
        return new WebLookupMediaResult($url, $mediaType, $mediaInfo);
    }
}