PHP Classes

File: src/Search/Bitap/search.php

Recommend this page to a friend!
  Classes of AccountKiller   Fuse   src/Search/Bitap/search.php   Download  
File: src/Search/Bitap/search.php
Role: Example script
Content type: text/plain
Description: Example script
Class: Fuse
Fuzzy search of arrays using the Bitap algorithm
Author: By
Last change:
Date: 2 years ago
Size: 6,696 bytes
 

Contents

Class file image Download
<?php

namespace Fuse\Search\Bitap;

use
Fuse\Exception\PatternLengthTooLargeException;
use
Fuse\Search\Bitap\Constants;

use function
Fuse\Search\Bitap\computeScore;
use function
Fuse\Search\Bitap\convertMaskToIndices;
use function
Fuse\Core\config;

/**
 * @return (array|bool|float|int|mixed)[]
 *
 * @psalm-return array{isMatch: bool, score: 1|float|mixed, indices?: array}
 */
function search(string $text, string $pattern, array $patternAlphabet, array $options = []): array
{
   
$location = $options['location'] ?? config('location');
   
$distance = $options['distance'] ?? config('distance');
   
$threshold = $options['threshold'] ?? config('threshold');
   
$findAllMatches = $options['findAllMatches'] ?? config('findAllMatches');
   
$minMatchCharLength = $options['minMatchCharLength'] ?? config('minMatchCharLength');
   
$includeMatches = $options['includeMatches'] ?? config('includeMatches');
   
$ignoreLocation = $options['ignoreLocation'] ?? config('ignoreLocation');

    if (
mb_strlen($pattern) > Constants::MAX_BITS) {
        throw new
PatternLengthTooLargeException(Constants::MAX_BITS);
    }

   
$patternLen = mb_strlen($pattern);

   
// Set starting location at beginning text and initialize the alphabet.
   
$textLen = mb_strlen($text);

   
// Handle the case when location > text.length
   
$expectedLocation = max(0, min($location, $textLen));

   
// Highest score beyond which we give up.
   
$currentThreshold = $threshold;

   
// Is there a nearby exact match? (speedup)
   
$bestLocation = $expectedLocation;

   
// Performance: only computer matches when the minMatchCharLength > 1
    // OR if `includeMatches` is true.
   
$computeMatches = $minMatchCharLength > 1 || $includeMatches;

   
// A mask of the matches, used for building the indices
   
$matchMask = $computeMatches ? [$textLen] : [];

   
// Get all exact matches, here for speed up
   
while (($index = mb_strpos($text, $pattern, $bestLocation)) !== false) {
       
$score = computeScore($pattern, [
           
'currentLocation' => $index,
           
'expectedLocation' => $expectedLocation,
           
'distance' => $distance,
           
'ignoreLocation' => $ignoreLocation,
        ]);

       
$currentThreshold = min($score, $currentThreshold);
       
$bestLocation = $index + $patternLen;

        if (
$computeMatches) {
           
$i = 0;
            while (
$i < $patternLen) {
               
$matchMask[$index + $i] = 1;
               
$i += 1;
            }
        }
    }

   
// Reset the best location
   
$bestLocation = -1;

   
$lastBitArr = [];
   
$finalScore = 1;
   
$binMax = $patternLen + $textLen;

   
$mask = 1 << $patternLen - 1;

    for (
$i = 0; $i < $patternLen; $i += 1) {
       
// Scan for the best match; each iteration allows for one more error.
        // Run a binary search to determine how far from the match location we can stray
        // at this error level.
       
$binMin = 0;
       
$binMid = $binMax;

        while (
$binMin < $binMid) {
           
$score = computeScore($pattern, [
               
'errors' => $i,
               
'currentLocation' => $expectedLocation + $binMid,
               
'expectedLocation' => $expectedLocation,
               
'distance' => $distance,
               
'ignoreLocation' => $ignoreLocation,
            ]);

            if (
$score <= $currentThreshold) {
               
$binMin = $binMid;
            } else {
               
$binMax = $binMid;
            }

           
$binMid = floor(($binMax - $binMin) / 2 + $binMin);
        }

       
// Use the result from this iteration as the maximum for the next.
       
$binMax = $binMid;

       
$start = max(1, $expectedLocation - $binMid + 1);
       
$finish = $findAllMatches
           
? $textLen
           
: min($expectedLocation + $binMid, $textLen) + $patternLen;

       
// Initialize the bit array
       
$bitArr = [$finish + 2];

       
$bitArr[$finish + 1] = (1 << $i) - 1;

        for (
$j = $finish; $j >= $start; $j -= 1) {
           
$currentLocation = $j - 1;
           
$charMatch = $patternAlphabet[mb_substr($text, $currentLocation, 1)] ?? null;

            if (
$computeMatches) {
               
// Speed up: quick bool to int conversion (i.e, `charMatch ? 1 : 0`)
               
$matchMask[$currentLocation] = (int) $charMatch;
            }

           
// First pass: exact match
           
$bitArr[$j] = ((($bitArr[$j + 1] ?? 0) << 1) | 1) & $charMatch;

           
// Subsequent passes: fuzzy match
           
if ($i) {
               
$bitArr[$j] |=
                    (((
$lastBitArr[$j + 1] ?? 0) | ($lastBitArr[$j] ?? 0)) << 1) |
                   
1 |
                    (
$lastBitArr[$j + 1] ?? 0);
            }

            if (
$bitArr[$j] & $mask) {
               
$finalScore = computeScore($pattern, [
                   
'errors' => $i,
                   
'currentLocation' => $currentLocation,
                   
'expectedLocation' => $expectedLocation,
                   
'distance' => $distance,
                   
'ignoreLocation' => $ignoreLocation,
                ]);

               
// This match will almost certainly be better than any existing match.
                // But check anyway.
               
if ($finalScore <= $currentThreshold) {
                   
// Indeed it is
                   
$currentThreshold = $finalScore;
                   
$bestLocation = $currentLocation;

                   
// Already passed `loc`, downhill from here on in.
                   
if ($bestLocation <= $expectedLocation) {
                        break;
                    }

                   
// When passing `bestLocation`, don't exceed our current distance from `expectedLocation`.
                   
$start = max(1, 2 * $expectedLocation - $bestLocation);
                }
            }
        }

       
// No hope for a (better) match at greater error levels.
       
$score = computeScore($pattern, [
           
'errors' => $i + 1,
           
'currentLocation' => $expectedLocation,
           
'expectedLocation' => $expectedLocation,
           
'distance' => $distance,
           
'ignoreLocation' => $ignoreLocation,
        ]);

        if (
$score > $currentThreshold) {
            break;
        }

       
$lastBitArr = $bitArr;
    }

   
$result = [
       
'isMatch' => $bestLocation >= 0,
       
'score' => max(0.001, $finalScore),
    ];

    if (
$computeMatches) {
       
$indices = convertMaskToIndices($matchMask, $minMatchCharLength);
        if (
sizeof($indices) === 0) {
           
$result['isMatch'] = false;
        } elseif (
$includeMatches) {
           
$result['indices'] = $indices;
        }
    }

    return
$result;
}