From 5c9b13073d7581730cf4a08dcfb4da38c2a1e8c2 Mon Sep 17 00:00:00 2001 From: flashwave Date: Wed, 20 Jul 2022 00:13:56 +0000 Subject: [PATCH] Fixed double encoding, probably. --- src/Lookup/WebLookup.php | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/src/Lookup/WebLookup.php b/src/Lookup/WebLookup.php index 3c97338..dd3a999 100644 --- a/src/Lookup/WebLookup.php +++ b/src/Lookup/WebLookup.php @@ -34,7 +34,7 @@ final class WebLookup implements \Uiharu\ILookup { CURLOPT_DEFAULT_PROTOCOL => 'https', CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION, CURLOPT_HTTPHEADER => [ - 'Accept: text/html,application/xhtml+xml', + 'Accept: text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8', ], ]); return $curl; @@ -107,7 +107,7 @@ final class WebLookup implements \Uiharu\ILookup { $mediaType = MediaType::parse('application/octet-stream'); } - $isXHTML = $mediaType->equals('application/xhtml+xml'); + $isXHTML = $mediaType->equals('application/xhtml+xml') || $mediaType->equals('application/xml'); if($isXHTML || $mediaType->equals('text/html')) return $this->lookupSite($url, $req, $mediaType, $isXHTML); @@ -123,22 +123,21 @@ final class WebLookup implements \Uiharu\ILookup { $body = self::reqBody($req); self::reqClose($req); + $charSet = $mediaType->getCharset(); + $document = new DOMDocument; if($isXHTML) { $document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING); } else { + $document->encoding = $charSet; $document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING); foreach($document->childNodes as $child) if($child->nodeType === XML_PI_NODE) { $document->removeChild($child); break; } - - $document->encoding = $mediaType->getCharset(); } - $charSet = $document->encoding; - $siteInfo = new stdClass; $siteInfo->title = ''; $siteInfo->metaTitle = ''; @@ -150,7 +149,10 @@ final class WebLookup implements \Uiharu\ILookup { $titleTag = $document->getElementsByTagName('title'); foreach($titleTag as $tag) { - $siteInfo->title = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet)); + $content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet)); + if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($content)) === 'UTF-8') + $content = $decoded; + $siteInfo->title = $content; break; } @@ -167,26 +169,34 @@ final class WebLookup implements \Uiharu\ILookup { if(empty($nameAttr) || empty($valueAttr)) continue; + if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($nameAttr)) === 'UTF-8') + $nameAttr = $decoded; + if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($valueAttr)) === 'UTF-8') + $valueAttr = $decoded; + switch($nameAttr) { case 'og:title': case 'twitter:title': - $siteInfo->metaTitle = $valueAttr; + if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle)) + $siteInfo->metaTitle = $valueAttr; break; case 'description': case 'og:description': case 'twitter:description': - if(empty($siteInfo->desc)) + if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc)) $siteInfo->desc = $valueAttr; break; case 'og:site_name': - $siteInfo->siteName = $valueAttr; + if(empty($siteInfo->siteName)) + $siteInfo->siteName = $valueAttr; break; case 'og:image': case 'twitter:image': - $siteInfo->image = $valueAttr; + if(empty($siteInfo->image)) + $siteInfo->image = $valueAttr; break; case 'theme-color':