isWeb(); }

    /**
     * Creates a cURL handle for $url, preconfigured for short HTTP(S)-only lookups.
     *
     * @param string $url Address to request.
     * @return mixed The configured cURL handle.
     * @throws RuntimeException If the handle cannot be created or configured.
     */
    private static function reqCreate(string $url) {
        $curl = curl_init($url);
        if($curl === false)
            throw new RuntimeException('Failed to initialise web request.');

        // curl_setopt_array() stops at the first option it cannot set, so only
        // options that must be in effect are listed here and the result is checked.
        $configured = curl_setopt_array($curl, [
            CURLOPT_AUTOREFERER => true,
            CURLOPT_CERTINFO => false,
            CURLOPT_FAILONERROR => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_PATH_AS_IS => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_CONNECTTIMEOUT => 2,
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_TIMEOUT => 5,
            CURLOPT_DEFAULT_PROTOCOL => 'https',
            CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION,
            CURLOPT_HTTPHEADER => [
                'Accept: text/html,application/xhtml+xml',
            ],
        ]);

        if(!$configured) {
            curl_close($curl);
            throw new RuntimeException('Failed to configure web request.');
        }

        // TCP Fast Open is an optimisation only and is not available in every
        // libcurl build, so a failure to enable it is deliberately ignored.
        curl_setopt($curl, CURLOPT_TCP_FASTOPEN, true);

        return $curl;
    }

    /**
     * Performs a body-less (HEAD) request and parses the response headers.
     *
     * When redirects are followed the output contains one header block per
     * response; only the status and headers of the last block are returned.
     *
     * @param mixed $curl Handle created by reqCreate().
     * @return array{status: int, lines: array<string, string>}|null
     *         Status code and lowercase-keyed headers, or null if the transfer failed.
     */
    private static function reqHead($curl): array|null {
        curl_setopt_array($curl, [
            CURLOPT_NOBODY => true,
            CURLOPT_HEADER => true,
        ]);

        $raw = curl_exec($curl);
        if($raw === false)
            return null;

        $status = 200;
        $lines = [];

        foreach(explode("\r\n", trim($raw)) as $header) {
            if(empty($header))
                continue;

            // A status line opens a new response: take its code and discard
            // the headers collected from any earlier (redirect) response.
            if(strncasecmp($header, 'HTTP/', 5) === 0) {
                $statusParts = explode(' ', $header);
                if(isset($statusParts[1]) && is_numeric($statusParts[1]))
                    $status = (int)$statusParts[1];

                $lines = [];
                continue;
            }

            // Anything else without a colon is not a header field.
            if(!str_contains($header, ':'))
                continue;

            [$name, $value] = explode(':', $header, 2);
            $name = mb_strtolower($name);
            $value = trim($value);

            // Repeated headers are folded into one comma separated value.
            if(isset($lines[$name]))
                $lines[$name] .= ', ' . $value;
            else
                $lines[$name] = $value;
        }

        return ['status' => $status, 'lines' => $lines];
    }

    /**
     * Re-runs the request on the same handle as a GET and returns the body.
     *
     * @param mixed $curl Handle created by reqCreate().
     * @return string|false Response body, or false if the transfer failed.
     */
    private static function reqBody($curl): string|false {
        curl_setopt_array($curl, [
            // Explicitly switch the method back to GET after reqHead().
            CURLOPT_HTTPGET => true,
            CURLOPT_NOBODY => false,
            CURLOPT_HEADER => false,
        ]);

        return curl_exec($curl);
    }

    /**
     * Returns the message of the last error that occurred on the handle.
     *
     * @param mixed $curl Handle created by reqCreate().
     */
    private static function reqError($curl): string {
        return curl_error($curl);
    }

    /**
     * Closes the handle.
     *
     * @param mixed $curl Handle created by reqCreate().
     */
    private static function reqClose($curl): void {
        curl_close($curl);
    }

    /**
     * Looks up $url and builds a result based on the media type reported in
     * the response headers: HTML/XHTML pages are scraped for metadata, media
     * files are probed, everything else gets a generic fallback result.
     *
     * @throws RuntimeException If the header request fails.
     */
    public function lookup(Url $url): WebLookupResult {
        // Explicit cast so the call also works under strict_types.
        // Assumes Url is string-convertible, which the string parameter already required.
        $req = self::reqCreate((string)$url);

        $head = self::reqHead($req);
        if($head === null) {
            // Read the error before closing the handle.
            $error = self::reqError($req);
            self::reqClose($req);
            throw new RuntimeException('Web request timed out: ' . $error);
        }

        try {
            $mediaType = MediaType::parse($head['lines']['content-type'] ?? '');
        } catch(InvalidArgumentException $ex) {
            // Missing or unparsable Content-Type: treat as opaque binary data.
            $mediaType = MediaType::parse('application/octet-stream');
        }

        $isXHTML = $mediaType->equals('application/xhtml+xml');
        if($isXHTML || $mediaType->equals('text/html'))
            return $this->lookupSite($url, $req, $mediaType, $isXHTML); // closes $req itself

        self::reqClose($req);

        if(MediaTypeExts::isMedia($mediaType))
            return $this->lookupMedia($url, $mediaType);

        return new WebLookupFallbackResult(
            $url,
            $mediaType,
            $url->getHost() . ': ' . basename($url->getPath())
        );
    }

    /**
     * Downloads the page body and extracts title and metadata tags from it.
     * The handle is always closed before this method returns or throws.
     *
     * @param mixed $req Handle on which reqHead() has already been run.
     * @throws RuntimeException If the body download fails.
     */
    private function lookupSite(Url $url, $req, MediaType $mediaType, bool $isXHTML): WebLookupResult {
        $body = self::reqBody($req);
        $error = $body === false ? self::reqError($req) : '';
        self::reqClose($req);

        if($body === false)
            throw new RuntimeException('Web request timed out: ' . $error);

        $siteInfo = new stdClass;
        $siteInfo->title = '';
        $siteInfo->metaTitle = '';
        $siteInfo->desc = '';
        $siteInfo->siteName = '';
        $siteInfo->image = '';
        $siteInfo->colour = '';
        $siteInfo->type = 'website';

        // DOMDocument refuses empty input, and there is nothing to extract anyway.
        if($body === '')
            return new WebLookupSiteResult($url, $mediaType, $siteInfo);

        $document = new DOMDocument;
        $parseFlags = LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING;

        if($isXHTML) {
            $document->loadXML($body, $parseFlags);
        } else {
            $document->loadHTML($body, $parseFlags);

            // Drop the first top-level processing instruction, if any.
            foreach($document->childNodes as $child)
                if($child->nodeType === XML_PI_NODE) {
                    $document->removeChild($child);
                    break;
                }

            $document->encoding = $mediaType->getCharset();
        }

        // NOTE(review): extracted text is re-encoded from this charset to UTF-8;
        // confirm this matches how the DOM extension hands strings back.
        $charSet = $document->encoding;

        $firstTitle = $document->getElementsByTagName('title')->item(0);
        if($firstTitle !== null)
            $siteInfo->title = trim(mb_convert_encoding($firstTitle->textContent, 'utf-8', $charSet));

        foreach($document->getElementsByTagName('meta') as $tag) {
            // Key is taken from "name", falling back to "property".
            if($tag->hasAttribute('name'))
                $nameAttr = $tag->getAttribute('name');
            elseif($tag->hasAttribute('property'))
                $nameAttr = $tag->getAttribute('property');
            else
                $nameAttr = '';

            // Value is taken from "value", falling back to "content".
            if($tag->hasAttribute('value'))
                $valueAttr = $tag->getAttribute('value');
            elseif($tag->hasAttribute('content'))
                $valueAttr = $tag->getAttribute('content');
            else
                $valueAttr = '';

            $nameAttr = trim(mb_convert_encoding($nameAttr, 'utf-8', $charSet));
            $valueAttr = trim(mb_convert_encoding($valueAttr, 'utf-8', $charSet));

            // Strict comparison so a literal "0" still counts as a value.
            if($nameAttr === '' || $valueAttr === '')
                continue;

            switch($nameAttr) {
                case 'og:title':
                case 'twitter:title':
                    $siteInfo->metaTitle = $valueAttr;
                    break;

                case 'description':
                case 'og:description':
                case 'twitter:description':
                    // First description found wins.
                    if($siteInfo->desc === '')
                        $siteInfo->desc = $valueAttr;
                    break;

                case 'og:site_name':
                    $siteInfo->siteName = $valueAttr;
                    break;

                case 'og:image':
                case 'twitter:image':
                    $siteInfo->image = $valueAttr;
                    break;

                case 'theme-color':
                    $siteInfo->colour = $valueAttr;
                    break;

                case 'og:type':
                    $siteInfo->type = 'website:' . $valueAttr;
                    break;
            }
        }

        return new WebLookupSiteResult($url, $mediaType, $siteInfo);
    }

    /**
     * Builds a media result from the output of FFMPEG::cleanProbe() for $url.
     */
    private function lookupMedia(Url $url, MediaType $mediaType): WebLookupResult {
        $mediaInfo = FFMPEG::cleanProbe($url);

        return new WebLookupMediaResult($url, $mediaType, $mediaInfo);
    }
}