From 6a2060c7a6807f075ebf788c3a3cad710e24c78b Mon Sep 17 00:00:00 2001
From: flashwave
Date: Fri, 27 Oct 2023 22:25:17 +0000
Subject: [PATCH] Use NodeJS/Cheerio for HTML parsing.

---
 .gitignore               |   1 +
 extract.mjs              |  91 ++++++++++++++++++
 package-lock.json        | 193 +++++++++++++++++++++++++++++++++++++++
 package.json             |   5 +
 src/Lookup/WebLookup.php | 116 ++++++-----------------
 5 files changed, 317 insertions(+), 89 deletions(-)
 create mode 100644 extract.mjs
 create mode 100644 package-lock.json
 create mode 100644 package.json

diff --git a/.gitignore b/.gitignore
index 4d57da9..285e197 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
 /public/robots.txt
 /lib/index-dev
 /vendor
+/node_modules
diff --git a/extract.mjs b/extract.mjs
new file mode 100644
index 0000000..73ce1cc
--- /dev/null
+++ b/extract.mjs
@@ -0,0 +1,91 @@
+import * as cheerio from 'cheerio';
+
+/**
+ * Reads standard input to the end and resolves with its contents as one string.
+ *
+ * Decoding is left to the stream (setEncoding) instead of stringifying each chunk,
+ * so a multi-byte UTF-8 character split across two chunks is not replaced by U+FFFD.
+ *
+ * @returns {Promise<string>} Everything received on stdin; rejects on a stream error.
+ */
+const readStdIn = () => {
+    return new Promise((resolve, reject) => {
+        const chunks = [];
+
+        process.stdin.setEncoding('utf8');
+        process.stdin.on('data', chunk => chunks.push(chunk));
+        process.stdin.on('end', () => resolve(chunks.join('')));
+        process.stdin.on('error', err => reject(err));
+    });
+};
+
+// the HTML document comes in on stdin, the extracted metadata goes out as JSON on stdout
+const $ = cheerio.load(await readStdIn());
+const info = {
+    title: '',
+    metaTitle: '',
+    desc: '',
+    siteName: '',
+    image: '',
+    colour: '',
+    type: 'website',
+};
+
+const titleTag = $('title').first();
+if(titleTag.length > 0)
+    info.title = titleTag.text().trim();
+
+const metaTags = $('meta');
+for(const elemInfo of metaTags) {
+    const elem = $(elemInfo);
+
+    // the key is taken from name="" or property="", the value from value="" or content=""
+    const nameAttr = (elem.attr('name') ?? elem.attr('property') ?? '').trim();
+    if(nameAttr === '')
+        continue;
+
+    const valueAttr = (elem.attr('value') ?? elem.attr('content') ?? '').trim();
+
+    switch(nameAttr) {
+        case 'og:title':
+        case 'twitter:title':
+            // the longest candidate wins
+            if(info.metaTitle === '' || valueAttr.length > info.metaTitle.length)
+                info.metaTitle = valueAttr;
+            break;
+
+        case 'description':
+        case 'og:description':
+        case 'twitter:description':
+            // the longest candidate wins
+            if(info.desc === '' || valueAttr.length > info.desc.length)
+                info.desc = valueAttr;
+            break;
+
+        case 'og:site_name':
+            // the first non-empty value wins
+            if(info.siteName === '')
+                info.siteName = valueAttr;
+            break;
+
+        case 'og:image':
+        case 'twitter:image':
+            if(info.image === '')
+                info.image = valueAttr;
+            break;
+
+        case 'theme-color':
+            if(info.colour === '')
+                info.colour = valueAttr;
+            break;
+
+        case 'og:type':
+            // NOTE(review): unreachable as written, info.type starts out as 'website' so this
+            // guard never passes and og:type is ignored; confirm what consumers expect first
+            if(info.type === '')
+                info.type = `website:${valueAttr}`;
+            break;
+    }
+}
+
+console.log(JSON.stringify(info));
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 0000000..d272f78
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,193 @@
+{
+  "name": "uiharu.edgii.net",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "dependencies": {
+        "cheerio": "^1.0.0-rc.12"
+      }
+    },
+    "node_modules/boolbase": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
+      "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
+    },
+    "node_modules/cheerio": {
+      "version": "1.0.0-rc.12",
+      "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz",
+      "integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==",
+      "dependencies": {
+        "cheerio-select": "^2.1.0",
+        "dom-serializer": "^2.0.0",
+        "domhandler": "^5.0.3",
+        "domutils": "^3.0.1",
+        "htmlparser2": "^8.0.1",
+        "parse5": "^7.0.0",
+        "parse5-htmlparser2-tree-adapter": "^7.0.0"
+      },
+      "engines": {
+        "node": ">= 6"
+      },
+      "funding": {
+        "url": "https://github.com/cheeriojs/cheerio?sponsor=1"
+      }
+    },
+    "node_modules/cheerio-select": {
+      "version": "2.1.0",
+      "resolved":
"https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "dependencies": { + "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": 
"https://github.com/sponsors/fb55" + } + ] + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz", + "integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/htmlparser2": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", + "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1", + "entities": "^4.4.0" + } + }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": 
"sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
+      "dependencies": {
+        "boolbase": "^1.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/nth-check?sponsor=1"
+      }
+    },
+    "node_modules/parse5": {
+      "version": "7.1.2",
+      "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
+      "integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==",
+      "dependencies": {
+        "entities": "^4.4.0"
+      },
+      "funding": {
+        "url": "https://github.com/inikulin/parse5?sponsor=1"
+      }
+    },
+    "node_modules/parse5-htmlparser2-tree-adapter": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz",
+      "integrity": "sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==",
+      "dependencies": {
+        "domhandler": "^5.0.2",
+        "parse5": "^7.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/inikulin/parse5?sponsor=1"
+      }
+    }
+  }
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..573f2cd
--- /dev/null
+++ b/package.json
@@ -0,0 +1,5 @@
+{
+  "dependencies": {
+    "cheerio": "^1.0.0-rc.12"
+  }
+}
diff --git a/src/Lookup/WebLookup.php b/src/Lookup/WebLookup.php
index 0bb60aa..b68804a 100644
--- a/src/Lookup/WebLookup.php
+++ b/src/Lookup/WebLookup.php
@@ -124,99 +124,37 @@ final class WebLookup implements \Uiharu\ILookup {
         $body = self::reqBody($req);
         self::reqClose($req);
 
-        $charSet = $mediaType->getCharset();
-        $urlHost = $url->getHost();
-        $charSetWrangle = function(string $input) use ($charSet, $urlHost): string {
-            // fuck it
-            if($urlHost === 'pixiv.net' || $urlHost === 'www.pixiv.net') {
-                $decoded = mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8');
-                if(mb_check_encoding($decoded, 'UTF-8') && str_repeat('?', strlen($decoded)) !== $decoded)
-                    return $decoded;
-            }
+        // parsing is handed off to extract.mjs (NodeJS/Cheerio) since DOMDocument mishandles odd encodings (e.g. pixiv),
+        // the body is written to its stdin and it answers with a JSON object on stdout; stderr output counts as failure
+        $extract = proc_open(
+            sprintf('node %s', escapeshellarg(UIH_ROOT . '/extract.mjs')),
+            [0 => ['pipe', 'r'], 1 => ['pipe', 'w'], 2 => ['pipe', 'w']],
+            $pipes
+        );
+        if(!is_resource($extract))
+            throw new RuntimeException('Could not open extract.');
 
-            return $input;
-        };
+        try {
+            fwrite($pipes[0], $body);
+            fclose($pipes[0]);
 
-        $document = new DOMDocument;
-        if($isXHTML) {
-            $document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
-        } else {
-            $document->encoding = $charSet;
-            $document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
-            foreach($document->childNodes as $child)
-                if($child->nodeType === XML_PI_NODE) {
-                    $document->removeChild($child);
-                    break;
-                }
+            // stdout first: it grows with the untrusted page and would stall the child if stderr were drained first
+            $stdout = trim(stream_get_contents($pipes[1]));
+            $stderr = trim(stream_get_contents($pipes[2]));
+            if(!empty($stderr))
+                throw new RuntimeException('extract: ' . $stderr);
+            if(empty($stdout))
+                throw new RuntimeException('extract did not report any errors but exited without any output');
+        } finally {
+            // release our pipe ends before reaping, proc_close() can otherwise block on the child
+            fclose($pipes[1]);
+            fclose($pipes[2]);
+            proc_close($extract);
         }
 
-        $siteInfo = new stdClass;
-        $siteInfo->title = '';
-        $siteInfo->metaTitle = '';
-        $siteInfo->desc = '';
-        $siteInfo->siteName = '';
-        $siteInfo->image = '';
-        $siteInfo->colour = '';
-        $siteInfo->type = 'website';
-
-        $titleTag = $document->getElementsByTagName('title');
-        foreach($titleTag as $tag) {
-            $siteInfo->title = $charSetWrangle(trim($tag->textContent));
-            break;
-        }
-
-        $metaTags = $document->getElementsByTagName('meta');
-        foreach($metaTags as $tag) {
-            $nameAttr = $tag->hasAttribute('name') ? $tag->getAttribute('name') : (
-                $tag->hasAttribute('property') ? $tag->getAttribute('property') : ''
-            );
-            $valueAttr = $tag->hasAttribute('value') ? $tag->getAttribute('value') : (
-                $tag->hasAttribute('content') ? $tag->getAttribute('content') : ''
-            );
-            $nameAttr = trim(mb_convert_encoding($nameAttr, 'utf-8', $charSet));
-            $valueAttr = trim(mb_convert_encoding($valueAttr, 'utf-8', $charSet));
-            if(empty($nameAttr) || empty($valueAttr))
-                continue;
-
-            $nameAttr = $charSetWrangle($nameAttr);
-            $valueAttr = $charSetWrangle($valueAttr);
-
-            switch($nameAttr) {
-                case 'og:title':
-                case 'twitter:title':
-                    if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle))
-                        $siteInfo->metaTitle = $valueAttr;
-                    break;
-
-                case 'description':
-                case 'og:description':
-                case 'twitter:description':
-                    if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc))
-                        $siteInfo->desc = $valueAttr;
-                    break;
-
-                case 'og:site_name':
-                    if(empty($siteInfo->siteName))
-                        $siteInfo->siteName = $valueAttr;
-                    break;
-
-                case 'og:image':
-                case 'twitter:image':
-                    if(empty($siteInfo->image))
-                        $siteInfo->image = $valueAttr;
-                    break;
-
-                case 'theme-color':
-                    if(empty($siteInfo->colour))
-                        $siteInfo->colour = $valueAttr;
-                    break;
-
-                case 'og:type':
-                    if(empty($siteInfo->type))
-                        $siteInfo->type = 'website:' . $valueAttr;
-                    break;
-            }
-        }
+        $siteInfo = json_decode($stdout);
+        if(empty($siteInfo))
+            throw new RuntimeException('Failed to parse extract output.');
 
         return new WebLookupSiteResult($url, $mediaType, $siteInfo);
     }