misuzu/devel/MarkovDictionary.php

116 lines
3.2 KiB
PHP

<?php
/**
 * Read-only accessor for a binary markov dictionary file; generates text by
 * following weighted transitions between fixed-size text segments.
 *
 * File layout as read by this class (all integers little-endian):
 *  - 4 bytes:  magic "FMkD"
 *  - 12 bytes: header = version (1 byte), two unused bytes, segmentSize (1 byte),
 *              totalSegments (4 bytes), startSegments (4 bytes)
 *  - from byte 16 onwards, segment records, each made of:
 *      - 4 * segmentSize bytes of UTF-32LE text
 *      - 1 byte start flag (any non-zero value is treated as "start" by getStartPosition())
 *      - 2 bytes transition count
 *      - count * 6 bytes of transitions: absolute file offset of the next record
 *        (4 bytes) followed by its weight (2 bytes)
 */
class MarkovDictionary {
    private const MAGIC = 'FMkD';
    private const VERSION = 1;

    // Byte offset of the first segment record (4 magic bytes + 12 header bytes).
    private const DATA_OFFSET = 16;
    // Size in bytes of one transition entry (uint32 offset + uint16 weight).
    private const TRANSITION_SIZE = 6;

    /** @var resource|null Open read handle on the dictionary file; null once closed. */
    private $handle;
    private int $segmentSize;
    // Read from the header and stored; not consulted anywhere else in this class.
    private int $totalSegments;
    private int $startSegments;

    /**
     * Opens the dictionary file and validates its magic bytes and header.
     *
     * @param string $path Path to the dictionary file.
     * @throws InvalidArgumentException If the file is missing, cannot be opened,
     *                                  or does not carry a valid/compatible header.
     */
    public function __construct(string $path) {
        if(!is_file($path))
            throw new InvalidArgumentException('$path does not exist.');

        $handle = fopen($path, 'rb');
        if($handle === false)
            throw new InvalidArgumentException('$path could not be opened.');
        $this->handle = $handle;

        try {
            if(fread($handle, 4) !== self::MAGIC)
                throw new InvalidArgumentException('$path is not a valid markov dictionary.');

            $header = fread($handle, 12);
            if(!is_string($header) || strlen($header) !== 12)
                throw new InvalidArgumentException('$path is missing header data.');

            $fields = unpack('Cversion/Cunused1/Cunused2/CsegmentSize/VtotalSegments/VstartSegments', $header);
            if($fields === false || $fields['version'] !== self::VERSION)
                throw new InvalidArgumentException('$path version is incompatible.');

            // A zero segment size would make every later text read a zero-length fread().
            if($fields['segmentSize'] < 1)
                throw new InvalidArgumentException('$path has an invalid segment size.');
        } catch(InvalidArgumentException $ex) {
            // Do not keep the file open for an object that failed to construct.
            $this->close();
            throw $ex;
        }

        $this->segmentSize = $fields['segmentSize'];
        $this->totalSegments = $fields['totalSegments'];
        $this->startSegments = $fields['startSegments'];
    }

    /**
     * Closes the underlying file handle. Safe to call more than once.
     */
    public function close(): void {
        if($this->handle !== null) {
            fclose($this->handle);
            $this->handle = null;
        }
    }

    public function __destruct() {
        $this->close();
    }

    /**
     * Moves the file pointer back to the first segment record.
     */
    private function reset(): void {
        fseek($this->handle, self::DATA_OFFSET, SEEK_SET);
    }

    /**
     * Returns the open handle.
     *
     * @return resource
     * @throws RuntimeException If close() has already been called.
     */
    private function requireHandle() {
        if($this->handle === null)
            throw new RuntimeException('Markov dictionary has been closed.');
        return $this->handle;
    }

    /**
     * Reads exactly $length bytes from the current position.
     *
     * @throws RuntimeException If fewer bytes are available (truncated file or bad offset).
     */
    private function readBytes(int $length): string {
        $data = fread($this->requireHandle(), $length);
        if(!is_string($data) || strlen($data) !== $length)
            throw new RuntimeException('Unexpected end of markov dictionary data.');
        return $data;
    }

    /**
     * Reads one little-endian unsigned 16-bit integer from the current position.
     *
     * @throws RuntimeException If the file ends before two bytes could be read.
     */
    private function readUInt16(): int {
        return unpack('vvalue', $this->readBytes(2))['value'];
    }

    /**
     * Picks a segment record to start generating from and returns its file offset.
     *
     * The file pointer is left on the first segment record afterwards.
     *
     * NOTE(review): the draw is mt_rand(0, startSegments) - 2, kept exactly as it was.
     * Draws of 0, 1 and 2 therefore all resolve to the first record, and the walk
     * never goes further than the flagged record at index (startSegments - 2).
     * If startSegments is the number of flagged records, the last one is never
     * chosen - confirm against the dictionary writer before changing this.
     *
     * @return int Absolute file offset of the chosen segment record.
     * @throws RuntimeException If the dictionary is closed or ends during the walk.
     */
    public function getStartPosition(): int {
        $handle = $this->requireHandle();
        $textBytes = 4 * $this->segmentSize;

        // Always walk from the first record, even if an earlier call threw
        // before it could rewind the handle.
        fseek($handle, self::DATA_OFFSET, SEEK_SET);

        $randomStart = mt_rand(0, $this->startSegments) - 2;
        if($randomStart > 0) {
            for(;;) {
                // Skip the segment text to reach the start flag.
                fseek($handle, $textBytes, SEEK_CUR);

                $flag = fgetc($handle);
                if($flag === false)
                    throw new RuntimeException('Unexpected end of markov dictionary data.');

                if($flag !== "\0") {
                    if($randomStart < 1)
                        break;
                    --$randomStart;
                }

                // Skip this record's transition table to land on the next record.
                $nextSegments = $this->readUInt16();
                fseek($handle, self::TRANSITION_SIZE * $nextSegments, SEEK_CUR);
            }

            // The pointer sits just past the start flag; step back to the record start.
            fseek($handle, -$textBytes - 1, SEEK_CUR);
        }

        $startPos = ftell($handle);
        $this->reset();

        if($startPos === false)
            throw new RuntimeException('Unable to determine position in markov dictionary.');

        return $startPos;
    }

    /**
     * Generates a string by following weighted transitions from a start record.
     *
     * Generation stops when a record has no transitions, when all of its
     * transitions have a weight of zero, or after $safety records were read.
     *
     * @param int $safety Maximum number of segment records to read.
     * @param int $start  Absolute file offset of the record to begin at; a negative
     *                    value picks one through getStartPosition().
     * @return string The generated text, converted to UTF-8 and trimmed.
     * @throws RuntimeException If the dictionary is closed, or a record or
     *                          transition lies beyond the end of the file.
     */
    public function generate(int $safety = 2000, int $start = -1): string {
        $handle = $this->requireHandle();

        if($start < 0)
            $start = $this->getStartPosition();
        fseek($handle, $start, SEEK_SET);

        $textBytes = 4 * $this->segmentSize;
        $string = '';

        try {
            for($s = 0; $s < $safety; ++$s) {
                $string .= $this->readBytes($textBytes);

                // The start flag is irrelevant once generation is under way.
                fseek($handle, 1, SEEK_CUR);

                $nextSegments = $this->readUInt16();
                if($nextSegments < 1)
                    break;

                $transitions = [];
                $totalWeight = 0;
                for($i = 0; $i < $nextSegments; ++$i) {
                    $transition = unpack('Voffset/vweight', $this->readBytes(self::TRANSITION_SIZE));
                    $totalWeight += $transition['weight'];
                    $transitions[] = $transition;
                }

                // No transition can be chosen, treat the record as the end of the chain.
                if($totalWeight < 1)
                    break;

                // Weighted pick: draw a point in [1, total] and find the transition
                // whose cumulative weight range contains it.
                $pick = mt_rand(1, $totalWeight);
                foreach($transitions as $transition) {
                    $pick -= $transition['weight'];
                    if($pick <= 0) {
                        fseek($handle, $transition['offset'], SEEK_SET);
                        break;
                    }
                }
            }
        } finally {
            // Leave the handle on the first record whether or not generation succeeded.
            $this->reset();
        }

        $string = mb_convert_encoding($string, 'utf-8', 'utf-32le');
        return trim($string);
    }
}