Fixed issues caused by websites lying about their character set.

This commit is contained in:
flash 2023-10-21 17:33:42 +00:00
parent 1f256a40ca
commit 842e91a413

View file

@@ -125,6 +125,15 @@ final class WebLookup implements \Uiharu\ILookup {
self::reqClose($req); self::reqClose($req);
$charSet = $mediaType->getCharset(); $charSet = $mediaType->getCharset();
// Repairs text whose UTF-8 bytes were decoded as ISO-8859-1 (each byte became
// its own character). Only attempted when the declared charset is UTF-8; any
// other input is returned untouched.
// NOTE(review): assumes $charSet can be null/empty when the response declares
// no charset — confirm against getCharset(); the cast below tolerates that.
$charSetWrangle = function(string $input) use ($charSet): string {
    if(strcasecmp((string)$charSet, 'UTF-8') !== 0)
        return $input;

    // Map every character back to the single byte it occupies in ISO-8859-1.
    $decoded = mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8');

    // The recovered bytes must themselves form valid UTF-8, otherwise the
    // input was not mis-decoded in the way this closure repairs.
    if(!mb_check_encoding($decoded, 'UTF-8'))
        return $input;

    // mb_convert_encoding() swaps characters that do not exist in ISO-8859-1
    // for the substitute character ('?' by default), which is plain ASCII and
    // therefore passes the UTF-8 check above. Require a lossless round trip so
    // correctly decoded text (CJK, en dashes, ...) is never turned into '?'.
    if(mb_convert_encoding($decoded, 'UTF-8', 'ISO-8859-1') !== $input)
        return $input;

    return $decoded;
};
$document = new DOMDocument; $document = new DOMDocument;
if($isXHTML) { if($isXHTML) {
@@ -150,8 +159,7 @@ final class WebLookup implements \Uiharu\ILookup {
$titleTag = $document->getElementsByTagName('title'); $titleTag = $document->getElementsByTagName('title');
foreach($titleTag as $tag) { foreach($titleTag as $tag) {
$content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet)); $siteInfo->title = $charSetWrangle(trim($tag->textContent));
$siteInfo->title = $content;
break; break;
} }
@@ -168,6 +176,9 @@ final class WebLookup implements \Uiharu\ILookup {
if(empty($nameAttr) || empty($valueAttr)) if(empty($nameAttr) || empty($valueAttr))
continue; continue;
$nameAttr = $charSetWrangle($nameAttr);
$valueAttr = $charSetWrangle($valueAttr);
switch($nameAttr) { switch($nameAttr) {
case 'og:title': case 'og:title':
case 'twitter:title': case 'twitter:title':