Fixed double encoding, probably.

This commit is contained in:
flash 2022-07-20 00:13:56 +00:00
parent d90927469f
commit 5c9b13073d
1 changed file with 21 additions and 11 deletions

View File

@ -34,7 +34,7 @@ final class WebLookup implements \Uiharu\ILookup {
CURLOPT_DEFAULT_PROTOCOL => 'https',
CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible) Uiharu/' . UIH_VERSION,
CURLOPT_HTTPHEADER => [
'Accept: text/html,application/xhtml+xml',
'Accept: text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8',
],
]);
return $curl;
@ -107,7 +107,7 @@ final class WebLookup implements \Uiharu\ILookup {
$mediaType = MediaType::parse('application/octet-stream');
}
$isXHTML = $mediaType->equals('application/xhtml+xml');
$isXHTML = $mediaType->equals('application/xhtml+xml') || $mediaType->equals('application/xml');
if($isXHTML || $mediaType->equals('text/html'))
return $this->lookupSite($url, $req, $mediaType, $isXHTML);
@ -123,22 +123,21 @@ final class WebLookup implements \Uiharu\ILookup {
$body = self::reqBody($req);
self::reqClose($req);
$charSet = $mediaType->getCharset();
$document = new DOMDocument;
if($isXHTML) {
$document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
} else {
$document->encoding = $charSet;
$document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
foreach($document->childNodes as $child)
if($child->nodeType === XML_PI_NODE) {
$document->removeChild($child);
break;
}
$document->encoding = $mediaType->getCharset();
}
$charSet = $document->encoding;
$siteInfo = new stdClass;
$siteInfo->title = '';
$siteInfo->metaTitle = '';
@ -150,7 +149,10 @@ final class WebLookup implements \Uiharu\ILookup {
$titleTag = $document->getElementsByTagName('title');
foreach($titleTag as $tag) {
$siteInfo->title = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
$content = trim(mb_convert_encoding($tag->textContent, 'utf-8', $charSet));
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($content)) === 'UTF-8')
$content = $decoded;
$siteInfo->title = $content;
break;
}
@ -167,26 +169,34 @@ final class WebLookup implements \Uiharu\ILookup {
if(empty($nameAttr) || empty($valueAttr))
continue;
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($nameAttr)) === 'UTF-8')
$nameAttr = $decoded;
if(strtolower($charSet) === 'utf-8' && mb_detect_encoding($decoded = utf8_decode($valueAttr)) === 'UTF-8')
$valueAttr = $decoded;
switch($nameAttr) {
case 'og:title':
case 'twitter:title':
$siteInfo->metaTitle = $valueAttr;
if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle))
$siteInfo->metaTitle = $valueAttr;
break;
case 'description':
case 'og:description':
case 'twitter:description':
if(empty($siteInfo->desc))
if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc))
$siteInfo->desc = $valueAttr;
break;
case 'og:site_name':
$siteInfo->siteName = $valueAttr;
if(empty($siteInfo->siteName))
$siteInfo->siteName = $valueAttr;
break;
case 'og:image':
case 'twitter:image':
$siteInfo->image = $valueAttr;
if(empty($siteInfo->image))
$siteInfo->image = $valueAttr;
break;
case 'theme-color':