TM-SGNL-iOS/SignalServiceKit/Storage/FullTextSearchIndexer.swift
TeleMessage developers dde0620daf initial commit
2025-05-03 12:28:28 -07:00

277 lines
11 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// Copyright 2018 Signal Messenger, LLC
// SPDX-License-Identifier: AGPL-3.0-only
//
import Foundation
import GRDB
public enum FullTextSearchIndexer {
public static let matchTag = "match"
// MARK: - Normalization
private static var charactersToRemove: CharacterSet = {
// * We want to strip punctuation - and our definition of "punctuation"
// is broader than `CharacterSet.punctuationCharacters`.
// * FTS should be robust to (i.e. ignore) illegal and control characters,
// but it's safer if we filter them ourselves as well.
var charactersToFilter = CharacterSet.punctuationCharacters
charactersToFilter.formUnion(CharacterSet.illegalCharacters)
charactersToFilter.formUnion(CharacterSet.controlCharacters)
// We want to strip all ASCII characters except:
// * Letters a-z, A-Z
// * Numerals 0-9
// * Whitespace
var asciiToFilter = CharacterSet(charactersIn: UnicodeScalar(0x0)!..<UnicodeScalar(0x80)!)
assert(!asciiToFilter.contains(UnicodeScalar(0x80)!))
asciiToFilter.subtract(CharacterSet.alphanumerics)
asciiToFilter.subtract(CharacterSet.whitespacesAndNewlines)
charactersToFilter.formUnion(asciiToFilter)
return charactersToFilter
}()
// This is a hot method, especially while running large migrations.
// Changes to it should go through a profiler to make sure large migrations
// aren't adversely affected.
public static func normalizeText(_ text: String) -> String {
// 1. Filter out invalid characters.
let filtered = text.removeCharacters(characterSet: charactersToRemove)
// 2. Simplify whitespace.
let simplified = filtered.replaceCharacters(
characterSet: .whitespacesAndNewlines,
replacement: " "
)
// 3. Strip leading & trailing whitespace last, since we may replace
// filtered characters with whitespace.
let trimmed = simplified.trimmingCharacters(in: .whitespacesAndNewlines)
// 4. Use canonical mapping.
//
// From the GRDB docs:
//
// Generally speaking, matches may fail when content and query dont use
// the same unicode normalization. SQLite actually exhibits inconsistent
// behavior in this regard.
//
// For example, for aimé to match aimé, they better have the same
// normalization: the NFC aim\u{00E9} form may not match its NFD aime\u{0301}
// equivalent. Most strings that you get from Swift, UIKit and Cocoa use NFC,
// so be careful with NFD inputs (such as strings from the HFS+ file system,
// or strings that you cant trust like network inputs). Use
// String.precomposedStringWithCanonicalMapping to turn a string into NFC.
//
// Besides, if you want fi to match the ligature (U+FB01), then you need
// to normalize your indexed contents and inputs to NFKC or NFKD. Use
// String.precomposedStringWithCompatibilityMapping to turn a string into NFKC.
let canonical = trimmed.precomposedStringWithCanonicalMapping
return canonical
}
// MARK: - Querying
// We want to match by prefix for "search as you type" functionality.
// SQLite does not support suffix or contains matches.
public static func buildQuery(for searchText: String) -> String {
// 1. Normalize the search text.
//
// TODO: We could arguably convert to lowercase since the search
// is case-insensitive.
let normalizedSearchText = normalizeText(searchText)
// 2. Split the non-numeric text into query terms (or tokens).
let nonNumericText = String(String.UnicodeScalarView(normalizedSearchText.unicodeScalars.lazy.map {
if CharacterSet.decimalDigits.contains($0) {
return " "
} else {
return $0
}
}))
var queryTerms = nonNumericText.split(separator: " ")
// 3. Add an additional numeric-only query term.
let digitsOnlyScalars = normalizedSearchText.unicodeScalars.lazy.filter {
CharacterSet.decimalDigits.contains($0)
}
let digitsOnly: Substring = Substring(String(String.UnicodeScalarView(digitsOnlyScalars)))
queryTerms.append(digitsOnly)
// 4. De-duplicate and sort query terms.
// Duplicate terms are redundant.
// Sorting terms makes the output of this method deterministic and easier to test,
// and the order won't affect the search results.
queryTerms = Array(Set(queryTerms)).sorted()
// 5. Filter the query terms.
let filteredQueryTerms = queryTerms.filter {
// Ignore empty terms.
$0.count > 0
}.map {
// Allow partial match of each term.
//
// Note that we use double-quotes to enclose each search term.
// Quoted search terms can include a few more characters than
// "bareword" (non-quoted) search terms. This shouldn't matter,
// since we're filtering all of the affected characters, but
// quoting protects us from any bugs in that logic.
"\"\($0)\"*"
}
// 6. Join terms into query string.
let query = filteredQueryTerms.joined(separator: " ")
return query
}
}
// MARK: - Message Index
// See: http://groue.github.io/GRDB.swift/docs/4.1/index.html#full-text-search
// See: https://www.sqlite.org/fts5.html
extension FullTextSearchIndexer {
public static let ftsTableName = "indexable_text_fts"
static let contentTableName = "indexable_text"
static let uniqueIdColumn = "uniqueId"
static let collectionColumn = "collection"
static let ftsContentColumn = "ftsIndexableContent"
private static let legacyCollectionName = "TSInteraction"
private static func indexableContent(for message: TSMessage, tx: SDSAnyReadTransaction) -> String? {
guard !message.isViewOnceMessage else {
// Don't index "view-once messages".
return nil
}
guard !message.isGroupStoryReply else {
return nil
}
guard message.editState != .pastRevision else {
return nil
}
guard let bodyText = message.rawBody(transaction: tx) else {
return nil
}
return normalizeText(bodyText)
}
public static func insert(_ message: TSMessage, tx: SDSAnyWriteTransaction) throws {
guard let ftsContent = indexableContent(for: message, tx: tx) else {
return
}
try executeUpdate(
sql: """
INSERT INTO \(contentTableName)
(\(collectionColumn), \(uniqueIdColumn), \(ftsContentColumn))
VALUES
(?, ?, ?)
""",
arguments: [legacyCollectionName, message.uniqueId, ftsContent],
tx: tx
)
}
public static func update(_ message: TSMessage, tx: SDSAnyWriteTransaction) throws {
try delete(message, tx: tx)
try insert(message, tx: tx)
}
public static func delete(_ message: TSMessage, tx: SDSAnyWriteTransaction) throws {
try executeUpdate(
sql: """
DELETE FROM \(contentTableName)
WHERE \(uniqueIdColumn) == ?
AND \(collectionColumn) == ?
""",
arguments: [message.uniqueId, legacyCollectionName],
tx: tx
)
}
private static func executeUpdate(
sql: String,
arguments: StatementArguments,
tx: SDSAnyWriteTransaction
) throws {
let database = tx.unwrapGrdbWrite.database
do {
let statement = try database.cachedStatement(sql: sql)
try statement.setArguments(arguments)
try statement.execute()
} catch {
DatabaseCorruptionState.flagDatabaseCorruptionIfNecessary(
userDefaults: CurrentAppContext().appUserDefaults(),
error: error
)
throw error
}
}
// MARK: - Querying
public static func search(
for searchText: String,
maxResults: Int,
tx: SDSAnyReadTransaction,
block: (_ message: TSMessage, _ snippet: String, _ stop: inout Bool) -> Void
) {
let query = buildQuery(for: searchText)
if query.isEmpty {
// FullTextSearchFinder.query filters some characters, so query
// may now be empty.
Logger.warn("Empty query.")
return
}
// Search with the query interface or SQL
do {
// GRDB TODO: We could use bm25() instead of rank to order results.
let indexOfContentColumnInFTSTable = 0
// Determines the length of the snippet.
let numTokens: UInt = 15
let matchSnippet = "match_snippet"
let sql: String = """
SELECT
\(contentTableName).\(collectionColumn),
\(contentTableName).\(uniqueIdColumn),
SNIPPET(\(ftsTableName), \(indexOfContentColumnInFTSTable), '<\(matchTag)>', '</\(matchTag)>', '…', \(numTokens)) AS \(matchSnippet)
FROM \(ftsTableName)
LEFT JOIN \(contentTableName) ON \(contentTableName).rowId = \(ftsTableName).rowId
WHERE \(ftsTableName).\(ftsContentColumn) MATCH ?
ORDER BY rank
LIMIT \(maxResults)
"""
let cursor = try Row.fetchCursor(tx.unwrapGrdbRead.database, sql: sql, arguments: [query])
while let row = try cursor.next() {
let collection: String = row[collectionColumn]
guard collection == legacyCollectionName else {
owsFailDebug("Found something other than a message in the FTS table")
continue
}
guard let uniqueId = (row[uniqueIdColumn] as String).nilIfEmpty else {
owsFailDebug("Found a message with a uniqueId in the FTS table")
continue
}
let snippet: String = row[matchSnippet]
guard let message = TSMessage.anyFetchMessage(uniqueId: uniqueId, transaction: tx) else {
owsFailDebug("Couldn't find message that exists in the FTS table")
continue
}
var stop = false
block(message, snippet, &stop)
if stop {
break
}
}
} catch {
owsFailDebug("Couldn't fetch results: \(error.grdbErrorForLogging)")
}
}
}