Fixed double encoding, probably.

This commit is contained in:
flash 2022-07-20 00:13:56 +00:00
parent d90927469f
commit 5c9b13073d

View file

@@ -34,7 +34,7 @@ final class WebLookup implements \Uiharu\ILookup {
CURLOPT_DEFAULT_PROTOCOL => 'https', CURLOPT_DEFAULT_PROTOCOL => 'https',
CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION, CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION,
CURLOPT_HTTPHEADER => [ CURLOPT_HTTPHEADER => [
'Accept: text/html,application/xhtml+xml', 'Accept: text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8',
], ],
]); ]);
return $curl; return $curl;
@@ -107,7 +107,7 @@ final class WebLookup implements \Uiharu\ILookup {
$mediaType = MediaType::parse('application/octet-stream'); $mediaType = MediaType::parse('application/octet-stream');
} }
$isXHTML = $mediaType->equals('application/xhtml+xml'); $isXHTML = $mediaType->equals('application/xhtml+xml') || $mediaType->equals('application/xml');
if($isXHTML || $mediaType->equals('text/html')) if($isXHTML || $mediaType->equals('text/html'))
return $this->lookupSite($url, $req, $mediaType, $isXHTML); return $this->lookupSite($url, $req, $mediaType, $isXHTML);
@@ -123,22 +123,21 @@ final class WebLookup implements \Uiharu\ILookup {
$body = self::reqBody($req); $body = self::reqBody($req);
self::reqClose($req); self::reqClose($req);
$charSet = $mediaType->getCharset();
$document = new DOMDocument; $document = new DOMDocument;
if($isXHTML) { if($isXHTML) {
$document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING); $document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
} else { } else {
$document->encoding = $charSet;
$document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING); $document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
foreach($document->childNodes as $child) foreach($document->childNodes as $child)
if($child->nodeType === XML_PI_NODE) { if($child->nodeType === XML_PI_NODE) {
$document->removeChild($child); $document->removeChild($child);
break; break;
} }
$document->encoding = $mediaType->getCharset();
} }
$charSet = $document->encoding;
$siteInfo = new stdClass; $siteInfo = new stdClass;
$siteInfo->title = ''; $siteInfo->title = '';
$siteInfo->metaTitle = ''; $siteInfo->metaTitle = '';
@@ -150,7 +149,10 @@ final class WebLookup implements \Uiharu\ILookup {
$titleTag = $document->getElementsByTagName('title'); $titleTag = $document->getElementsByTagName('title');
foreach($titleTag as $tag) { foreach($titleTag as $tag) {
$siteInfo->title = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet)); $content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($content)) === 'UTF-8')
$content = $decoded;
$siteInfo->title = $content;
break; break;
} }
@@ -167,25 +169,33 @@ final class WebLookup implements \Uiharu\ILookup {
if(empty($nameAttr) || empty($valueAttr)) if(empty($nameAttr) || empty($valueAttr))
continue; continue;
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($nameAttr)) === 'UTF-8')
$nameAttr = $decoded;
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($valueAttr)) === 'UTF-8')
$valueAttr = $decoded;
switch($nameAttr) { switch($nameAttr) {
case 'og:title': case 'og:title':
case 'twitter:title': case 'twitter:title':
if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle))
$siteInfo->metaTitle = $valueAttr; $siteInfo->metaTitle = $valueAttr;
break; break;
case 'description': case 'description':
case 'og:description': case 'og:description':
case 'twitter:description': case 'twitter:description':
if(empty($siteInfo->desc)) if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc))
$siteInfo->desc = $valueAttr; $siteInfo->desc = $valueAttr;
break; break;
case 'og:site_name': case 'og:site_name':
if(empty($siteInfo->siteName))
$siteInfo->siteName = $valueAttr; $siteInfo->siteName = $valueAttr;
break; break;
case 'og:image': case 'og:image':
case 'twitter:image': case 'twitter:image':
if(empty($siteInfo->image))
$siteInfo->image = $valueAttr; $siteInfo->image = $valueAttr;
break; break;