From 842e91a4139638cccec71d4c029f7bc027901a89 Mon Sep 17 00:00:00 2001 From: flashwave Date: Sat, 21 Oct 2023 17:33:42 +0000 Subject: [PATCH] Fixed issues caused by websites lying about their character set. --- src/Lookup/WebLookup.php | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Lookup/WebLookup.php b/src/Lookup/WebLookup.php index b5d44ff..a6fe541 100644 --- a/src/Lookup/WebLookup.php +++ b/src/Lookup/WebLookup.php @@ -125,6 +125,15 @@ final class WebLookup implements \Uiharu\ILookup { self::reqClose($req); $charSet = $mediaType->getCharset(); + $charSetWrangle = function(string $input) use ($charSet): string { /* Repairs double-encoded text: UTF-8 bytes that were read as ISO-8859-1 and re-encoded as UTF-8. Only applied when the declared charset is UTF-8; the input is returned untouched unless the repair is lossless. */ + if(strtoupper($charSet) === 'UTF-8') { + $decoded = mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8'); + if(mb_check_encoding($decoded, 'UTF-8') && mb_convert_encoding($decoded, 'UTF-8', 'ISO-8859-1') === $input) /* round-trip guard: characters ISO-8859-1 cannot represent are replaced by the mbstring substitute character ('?' by default), which would still pass the UTF-8 check and destroy correctly decoded text */ + return $decoded; + } + + return $input; + }; $document = new DOMDocument; if($isXHTML) { @@ -150,8 +159,7 @@ final class WebLookup implements \Uiharu\ILookup { $titleTag = $document->getElementsByTagName('title'); foreach($titleTag as $tag) { - $content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet)); - $siteInfo->title = $content; + $siteInfo->title = $charSetWrangle(trim($tag->textContent)); break; } @@ -168,6 +176,9 @@ final class WebLookup implements \Uiharu\ILookup { if(empty($nameAttr) || empty($valueAttr)) continue; + $nameAttr = $charSetWrangle($nameAttr); + $valueAttr = $charSetWrangle($valueAttr); + switch($nameAttr) { case 'og:title': case 'twitter:title':