Use NodeJS/Cheerio for HTML parsing.

This commit is contained in:
flash 2023-10-27 22:25:17 +00:00
parent d90e1e1c0b
commit 6a2060c7a6
5 changed files with 301 additions and 89 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@
/public/robots.txt
/lib/index-dev
/vendor
/node_modules

75
extract.mjs Normal file
View File

@ -0,0 +1,75 @@
import * as cheerio from 'cheerio';
// Reads all of standard input and resolves with its contents as a UTF-8 string.
// Rejects if the stdin stream emits an error.
//
// Fix: the original accumulated with `stdIn += data`, which converts each
// Buffer chunk to a string independently; a multibyte UTF-8 sequence split
// across two chunks would be decoded as replacement characters. Setting an
// explicit encoding makes the stream's StringDecoder buffer partial
// sequences across chunk boundaries.
const readStdIn = () => {
    return new Promise((resolve, reject) => {
        process.stdin.setEncoding('utf8');
        const chunks = [];
        process.stdin.on('data', chunk => chunks.push(chunk));
        process.stdin.on('end', () => resolve(chunks.join('')));
        process.stdin.on('error', err => reject(err));
    });
};
// Parse the HTML document piped in on stdin and print a one-line JSON
// summary of its <title> and relevant <meta> tags to stdout, for
// consumption by the PHP WebLookup implementation.
const $ = cheerio.load(await readStdIn());

// Extracted site metadata; field names mirror the object shape the PHP
// side previously built with DOMDocument.
const info = {
    title: '',
    metaTitle: '',
    desc: '',
    siteName: '',
    image: '',
    colour: '',
    type: 'website',
};

// Document <title>, if present.
const titleTag = $('title').first();
if(titleTag.length > 0)
    info.title = titleTag.text().trim();

// Fold recognised standard/OpenGraph/Twitter meta names into info.
// `name` takes precedence over `property`, and `value` over `content`,
// matching the old PHP attribute lookup order.
for(const elemInfo of $('meta')) {
    const elem = $(elemInfo);
    const nameAttr = (elem.attr('name') ?? elem.attr('property') ?? '').trim();
    if(nameAttr === '')
        continue;
    const valueAttr = (elem.attr('value') ?? elem.attr('content') ?? '').trim();
    switch(nameAttr) {
        case 'og:title':
        case 'twitter:title':
            // Keep the longest variant: it is usually the least truncated.
            if(info.metaTitle === '' || valueAttr.length > info.metaTitle.length)
                info.metaTitle = valueAttr;
            break;
        case 'description':
        case 'og:description':
        case 'twitter:description':
            if(info.desc === '' || valueAttr.length > info.desc.length)
                info.desc = valueAttr;
            break;
        case 'og:site_name':
            if(info.siteName === '')
                info.siteName = valueAttr;
            break;
        case 'og:image':
        case 'twitter:image':
            if(info.image === '')
                info.image = valueAttr;
            break;
        case 'theme-color':
            if(info.colour === '')
                info.colour = valueAttr;
            break;
        case 'og:type':
            // Fix: `type` defaults to 'website', so the original guard of
            // `info.type === ''` could never be true and og:type was
            // silently discarded. Apply the first non-empty og:type seen.
            if(info.type === 'website' && valueAttr !== '')
                info.type = `website:${valueAttr}`;
            break;
    }
}

console.log(JSON.stringify(info));

193
package-lock.json generated Normal file
View File

@ -0,0 +1,193 @@
{
"name": "uiharu.edgii.net",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"dependencies": {
"cheerio": "^1.0.0-rc.12"
}
},
"node_modules/boolbase": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
},
"node_modules/cheerio": {
"version": "1.0.0-rc.12",
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz",
"integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==",
"dependencies": {
"cheerio-select": "^2.1.0",
"dom-serializer": "^2.0.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"htmlparser2": "^8.0.1",
"parse5": "^7.0.0",
"parse5-htmlparser2-tree-adapter": "^7.0.0"
},
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/cheeriojs/cheerio?sponsor=1"
}
},
"node_modules/cheerio-select": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
"integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
"dependencies": {
"boolbase": "^1.0.0",
"css-select": "^5.1.0",
"css-what": "^6.1.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/css-select": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz",
"integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==",
"dependencies": {
"boolbase": "^1.0.0",
"css-what": "^6.1.0",
"domhandler": "^5.0.2",
"domutils": "^3.0.1",
"nth-check": "^2.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/css-what": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz",
"integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==",
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/dom-serializer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"entities": "^4.2.0"
},
"funding": {
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
}
},
"node_modules/domelementtype": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
]
},
"node_modules/domhandler": {
"version": "5.0.3",
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
"dependencies": {
"domelementtype": "^2.3.0"
},
"engines": {
"node": ">= 4"
},
"funding": {
"url": "https://github.com/fb55/domhandler?sponsor=1"
}
},
"node_modules/domutils": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
"dependencies": {
"dom-serializer": "^2.0.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3"
},
"funding": {
"url": "https://github.com/fb55/domutils?sponsor=1"
}
},
"node_modules/entities": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/htmlparser2": {
"version": "8.0.2",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
"funding": [
"https://github.com/fb55/htmlparser2?sponsor=1",
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
],
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"entities": "^4.4.0"
}
},
"node_modules/nth-check": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
"dependencies": {
"boolbase": "^1.0.0"
},
"funding": {
"url": "https://github.com/fb55/nth-check?sponsor=1"
}
},
"node_modules/parse5": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
"integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==",
"dependencies": {
"entities": "^4.4.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-htmlparser2-tree-adapter": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz",
"integrity": "sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==",
"dependencies": {
"domhandler": "^5.0.2",
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
}
}
}

5
package.json Normal file
View File

@ -0,0 +1,5 @@
{
"dependencies": {
"cheerio": "^1.0.0-rc.12"
}
}

View File

@ -124,99 +124,37 @@ final class WebLookup implements \Uiharu\ILookup {
$body = self::reqBody($req);
self::reqClose($req);
$charSet = $mediaType->getCharset();
$urlHost = $url->getHost();
$charSetWrangle = function(string $input) use ($charSet, $urlHost): string {
// fuck it
if($urlHost === 'pixiv.net' || $urlHost === 'www.pixiv.net') {
$decoded = mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8');
if(mb_check_encoding($decoded, 'UTF-8') && str_repeat('?', strlen($decoded)) !== $decoded)
return $decoded;
}
// ok hear me out
// there's absolutely no good html scraping libraries for PHP
// DOMDocument Exists but kinda blows at catching weird encoding events like with pixiv
// and i'm not about to rewrite this whole fucking thing in nodejs
// also at this point Index should probably provide a wrapper for proc_open lol
$extract = proc_open(
sprintf('node %s/extract.mjs', UIH_ROOT),
[0 => ['pipe', 'r'], 1 => ['pipe', 'w'], 2 => ['pipe', 'w']],
$pipes
);
if(!is_resource($extract))
throw new RuntimeException('Could not open extract.');
return $input;
};
try {
fwrite($pipes[0], $body);
fclose($pipes[0]);
$document = new DOMDocument;
if($isXHTML) {
$document->loadXML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
} else {
$document->encoding = $charSet;
$document->loadHTML($body, LIBXML_NOERROR | LIBXML_NONET | LIBXML_NOWARNING);
foreach($document->childNodes as $child)
if($child->nodeType === XML_PI_NODE) {
$document->removeChild($child);
break;
}
$stderr = trim(stream_get_contents($pipes[2]));
if(!empty($stderr))
throw new RuntimeException('extract: ' . $stderr);
$stdout = trim(stream_get_contents($pipes[1]));
if(empty($stdout))
throw new RuntimeException('extract did not report any errors but exited without any output');
} finally {
proc_close($extract);
}
$siteInfo = new stdClass;
$siteInfo->title = '';
$siteInfo->metaTitle = '';
$siteInfo->desc = '';
$siteInfo->siteName = '';
$siteInfo->image = '';
$siteInfo->colour = '';
$siteInfo->type = 'website';
$titleTag = $document->getElementsByTagName('title');
foreach($titleTag as $tag) {
$siteInfo->title = $charSetWrangle(trim($tag->textContent));
break;
}
$metaTags = $document->getElementsByTagName('meta');
foreach($metaTags as $tag) {
$nameAttr = $tag->hasAttribute('name') ? $tag->getAttribute('name') : (
$tag->hasAttribute('property') ? $tag->getAttribute('property') : ''
);
$valueAttr = $tag->hasAttribute('value') ? $tag->getAttribute('value') : (
$tag->hasAttribute('content') ? $tag->getAttribute('content') : ''
);
$nameAttr = trim(mb_convert_encoding($nameAttr, 'utf-8', $charSet));
$valueAttr = trim(mb_convert_encoding($valueAttr, 'utf-8', $charSet));
if(empty($nameAttr) || empty($valueAttr))
continue;
$nameAttr = $charSetWrangle($nameAttr);
$valueAttr = $charSetWrangle($valueAttr);
switch($nameAttr) {
case 'og:title':
case 'twitter:title':
if(empty($siteInfo->metaTitle) || strlen($valueAttr) > strlen($siteInfo->metaTitle))
$siteInfo->metaTitle = $valueAttr;
break;
case 'description':
case 'og:description':
case 'twitter:description':
if(empty($siteInfo->desc) || strlen($valueAttr) > strlen($siteInfo->desc))
$siteInfo->desc = $valueAttr;
break;
case 'og:site_name':
if(empty($siteInfo->siteName))
$siteInfo->siteName = $valueAttr;
break;
case 'og:image':
case 'twitter:image':
if(empty($siteInfo->image))
$siteInfo->image = $valueAttr;
break;
case 'theme-color':
if(empty($siteInfo->colour))
$siteInfo->colour = $valueAttr;
break;
case 'og:type':
if(empty($siteInfo->type))
$siteInfo->type = 'website:' . $valueAttr;
break;
}
}
$siteInfo = json_decode($stdout);
if(empty($siteInfo))
throw new RuntimeException('Failed to parse extract output.');
return new WebLookupSiteResult($url, $mediaType, $siteInfo);
}