From fb5fd572384b8294e6b65746db252abe1c6fad3c Mon Sep 17 00:00:00 2001
From: soffee
Date: Tue, 8 Apr 2025 21:33:12 +0300
Subject: [PATCH] add better handling of word patterns

---
 keywords.yml.example | 15 +++++++
 package.json         |  2 +-
 src/config.ts        | 21 +++++-----
 src/keywords.ts      | 94 ++++++++++++++++++++++++++++----------------
 src/main.ts          |  5 ++-
 src/metrics.ts       | 21 +++++++++-
 src/utils.ts         |  3 ++
 7 files changed, 115 insertions(+), 46 deletions(-)
 create mode 100644 src/utils.ts

diff --git a/keywords.yml.example b/keywords.yml.example
index deddb34..feeceef 100644
--- a/keywords.yml.example
+++ b/keywords.yml.example
@@ -1,4 +1,19 @@
 keywords:
+  # will match just word 'meow' (requires word border on both sides)
   - meow
+
+  # will match plain regex
   - name: woof
     pattern: 'w[oa]+f'
+
+  # will match regex wrapped with word borders
+  # will match 'hi, woof :3', 'woof!', 'i heard a woof' but not 'i like subwoofers'
+  - name: woof
+    pattern: 'w[oa]+f'
+    word: true
+
+  # will match any word starting with 'aqua' (aquarium, aquatic, aquaculture, etc...)
+  # requires word border on both sides too
+  - name: aqua
+    pattern: 'aqua.*?'
+    word: true
diff --git a/package.json b/package.json
index fb93db9..a0e5b1e 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "mtproto_exporter",
   "type": "module",
-  "version": "1.1.0",
+  "version": "1.2.0",
   "packageManager": "pnpm@10.6.5",
   "license": "MIT",
   "scripts": {
diff --git a/src/config.ts b/src/config.ts
index 86c02f2..fa8363d 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -1,5 +1,5 @@
 import type { OptionDefinition } from "command-line-args";
-import type { KeywordLike } from "./keywords.js";
+import type { RawKeywordLike } from "./keywords.js";
 import { readFile } from "node:fs/promises";
 import cmdline from "command-line-args";
 import yaml from "js-yaml";
@@ -12,7 +12,7 @@ export interface Configuration {
     watchFile: boolean;
     includePeers?: number[];
     excludePeers?: number[];
-    keywords?: KeywordLike[];
+    keywords?: RawKeywordLike[];
 }
 
 const optionDefinitions: OptionDefinition[] = [
@@ -54,19 +54,22 @@ if (cli["exclude-peers"]) {
     }
 }
 
-export async function readKeywords(filePath: string): Promise {
+export async function readKeywords(filePath: string): Promise {
     const doc = yaml.load(await readFile(filePath, "utf8")) as { keywords?: any[] };
 
     if (doc.keywords && doc.keywords.constructor.name === "Array") {
-        const keywords: KeywordLike[] = [];
+        const keywords: RawKeywordLike[] = [];
         for (const item of doc.keywords) {
             if (typeof item === "string") {
                 keywords.push(item);
-            } else if (typeof item === "object" && item.name && item.pattern) {
-                keywords.push({
-                    name: item.name,
-                    pattern: new RegExp(item.pattern, "gi"),
-                });
+            } else if (item !== null && typeof item === "object" && typeof item.name === "string") {
+                if (typeof item.pattern === "string") {
+                    keywords.push({
+                        name: item.name,
+                        pattern: item.pattern,
+                        word: Boolean(item.word ?? false),
+                    });
+                }
             }
         }
         return keywords;
diff --git a/src/keywords.ts b/src/keywords.ts
index fdb0a79..ff26ece 100644
--- a/src/keywords.ts
+++ b/src/keywords.ts
@@ -3,37 +3,71 @@ import { PropagationAction } from "@mtcute/dispatcher";
 import { Counter } from "prom-client";
 import { config } from "./config.js";
 import { peersConfigFilter } from "./filters.js";
+import { escapeRegex } from "./utils.js";
 
-interface KeywordPattern {
+export interface RawKeywordPattern {
+    name: string;
+    pattern: string;
+    word: boolean;
+}
+
+export type RawKeywordLike = string | RawKeywordPattern;
+
+export interface KeywordPattern {
     name: string;
     pattern: RegExp;
 }
 
-export type KeywordLike = string | KeywordPattern;
+/** Punctuation treated as a word delimiter, pre-escaped for use inside a regex character class. */
+const WORD_BORDER = escapeRegex("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~");
+
+/**
+ * Compiles raw keyword definitions into global, case-insensitive regular expressions.
+ *
+ * Plain strings are escaped and matched literally as whole words. Pattern objects
+ * are used as written and are wrapped with word borders only when `word` is set.
+ *
+ * @param raw keyword definitions, as produced by `readKeywords`
+ * @returns one compiled pattern per definition, in the same order
+ * @throws SyntaxError when a user-supplied pattern is not a valid regular expression
+ */
+export function rawToPatterns(raw: RawKeywordLike[]): KeywordPattern[] {
+    const patterns: KeywordPattern[] = [];
+    for (const keyword of raw) {
+        let pattern;
+        let name;
+        let addBorders = false;
 
-export function newWordsCounter(dp: Dispatcher) {
-    const counter = new Counter({
-        name: "messenger_dialog_words_count",
-        help: "Number of words in messages since exporter startup",
-        labelNames: ["peerId", "word"],
-    });
-    dp.onNewMessage(peersConfigFilter(config), async (msg) => {
-        const words = msg.text.toLowerCase().split(" ");
-        for (const w of words) {
-            counter.inc({
-                peerId: msg.chat.id,
-                word: w,
-            });
+        if (typeof keyword === "string") {
+            pattern = escapeRegex(keyword);
+            name = keyword;
+            addBorders = true;
+        } else {
+            pattern = keyword.pattern;
+            name = keyword.name;
+            addBorders = keyword.word;
         }
-        return PropagationAction.Continue;
-    });
-    return counter;
+
+        // Borders are zero-width assertions so that neighbouring hits ("meow meow")
+        // can share one delimiter and the delimiter is never part of the match.
+        const borderStart = addBorders ? `(?<=^|[${WORD_BORDER}\\s])` : "";
+        const borderEnd = addBorders ? `(?=[${WORD_BORDER}\\s]|$)` : "";
+
+        patterns.push({
+            name,
+            // "g": String.prototype.match() must return every hit, its length is the count;
+            // "i": preserves the case-insensitive matching the previous "gi" patterns had.
+            pattern: new RegExp(`${borderStart}(?:${pattern})${borderEnd}`, "gi"),
+        });
+    }
+
+    return patterns;
 }
 
 export class KeywordsCounter extends Counter {
     private _dp: Dispatcher;
-    private _keywords: KeywordLike[];
-    constructor(dp: Dispatcher, keywords: KeywordLike[] = []) {
+    private _keywords: KeywordPattern[];
+    constructor(dp: Dispatcher, keywords: KeywordPattern[] = []) {
         super({
             name: "messenger_dialog_keywords_count",
             help: "Number of keywords found in messages since exporter startup",
@@ -42,24 +76,18 @@ export class KeywordsCounter extends Counter {
         this._dp = dp;
         this._keywords = keywords;
 
-        dp.onNewMessage(peersConfigFilter(config), async (msg) => {
+        this._dp.onNewMessage(peersConfigFilter(config), async (msg) => {
             for (const kw of this._keywords) {
-                let count;
-                let kwname;
-                if (typeof kw === "string") {
-                    const words = msg.text.toLowerCase().split(" ");
-                    count = words.filter(w => w === kw).length;
-                    kwname = kw;
-                } else {
-                    count = (msg.text.match(kw.pattern) || []).length;
-                    kwname = kw.name;
-                }
+                const count = (msg.text.match(kw.pattern) ?? []).length;
+
+                // this will prevent from flooding metrics with keywords that had never been triggered yet
                 if (count === 0) {
                     continue;
                 }
+
                 this.inc({
                     peerId: msg.chat.id,
-                    keyword: kwname,
+                    keyword: kw.name,
                 }, count);
             }
             return PropagationAction.Continue;
@@ -70,7 +98,7 @@ export class KeywordsCounter extends Counter {
         return this._keywords;
     }
 
-    public setKeywords(keywords: KeywordLike[]) {
+    public setKeywords(keywords: KeywordPattern[]) {
         this._keywords = keywords;
     }
 }
diff --git a/src/main.ts b/src/main.ts
index 7de6300..53db075 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -5,6 +5,7 @@ import { collectDefaultMetrics, Registry } from "prom-client";
 
 import { config, readKeywords } from "./config.js";
 import * as env from "./env.js";
+import { rawToPatterns } from "./keywords.js";
 import * as metrics from "./metrics.js";
 import MetricsServer from "./server.js";
 
@@ -36,7 +37,7 @@ registry.registerMetric(metrics.newUnreadCountGauge(tg));
 registry.registerMetric(metrics.newMessagesCounter(dp));
 
 if (config.keywords) {
-    const counter = new metrics.KeywordsCounter(dp, config.keywords);
+    const counter = new metrics.KeywordsCounter(dp, rawToPatterns(config.keywords));
     registry.registerMetric(counter);
 
     if (config.watchFile) {
@@ -47,7 +48,7 @@ if (config.keywords) {
             console.log("[watch-file] Keywords file was updated. Re-reading keywords configuration...");
             try {
                 config.keywords = await readKeywords(config.keywordsFile);
-                counter.setKeywords(config.keywords);
+                counter.setKeywords(rawToPatterns(config.keywords));
             } catch (e) {
                 console.error("Failed to read keywords file", config.keywordsFile, e);
             }
diff --git a/src/metrics.ts b/src/metrics.ts
index 9faf0db..daa9a55 100644
--- a/src/metrics.ts
+++ b/src/metrics.ts
@@ -5,7 +5,7 @@ import { Counter, Gauge } from "prom-client";
 
 import { config } from "./config.js";
 import { peersConfigBoolFilter, peersConfigFilter } from "./filters.js";
-import { KeywordsCounter, newWordsCounter } from "./keywords.js";
+import { KeywordsCounter } from "./keywords.js";
 
 function newMessagesCounter(dp: Dispatcher) {
     const counter = new Counter({
@@ -67,6 +67,25 @@ function newUnreadCountGauge(tg: TelegramClient) {
     return gauge;
 }
 
+function newWordsCounter(dp: Dispatcher) {
+    const counter = new Counter({
+        name: "messenger_dialog_words_count",
+        help: "Number of words in messages since exporter startup",
+        labelNames: ["peerId", "word"],
+    });
+    dp.onNewMessage(peersConfigFilter(config), async (msg) => {
+        const words = msg.text.toLowerCase().split(" ");
+        for (const w of words) {
+            counter.inc({
+                peerId: msg.chat.id,
+                word: w,
+            });
+        }
+        return PropagationAction.Continue;
+    });
+    return counter;
+}
+
 export {
     KeywordsCounter,
     newMessagesCounter,
diff --git a/src/utils.ts b/src/utils.ts
new file mode 100644
index 0000000..fcb122e
--- /dev/null
+++ b/src/utils.ts
@@ -0,0 +1,3 @@
+export function escapeRegex(text: string) {
+    return text.replace(/[/\-\\^$*+?.()|[\]{}]/g, "\\$&");
+}