add better handling of word patterns

This commit is contained in:
soffee 2025-04-08 21:33:12 +03:00
parent 69695e265a
commit fb5fd57238
7 changed files with 99 additions and 46 deletions

View file

@ -1,4 +1,19 @@
keywords: keywords:
# will match just word 'meow' (requires word border on both sides)
- meow - meow
# will match plain regex
- name: woof - name: woof
pattern: 'w[oa]+f' pattern: 'w[oa]+f'
# will match regex wrapped with word borders
# will match 'hi, woof :3', 'woof!', 'i heard a woof' but not 'i like subwoofers'
- name: woof
pattern: 'w[oa]+f'
word: true
# will match any word starting with 'aqua' (aquarium, aquatic, aquaculture, etc...)
# requires word border on both sides too
- name: aqua
pattern: 'aqua.*?'
word: true

View file

@ -1,7 +1,7 @@
{ {
"name": "mtproto_exporter", "name": "mtproto_exporter",
"type": "module", "type": "module",
"version": "1.1.0", "version": "1.2.0",
"packageManager": "pnpm@10.6.5", "packageManager": "pnpm@10.6.5",
"license": "MIT", "license": "MIT",
"scripts": { "scripts": {

View file

@ -1,5 +1,5 @@
import type { OptionDefinition } from "command-line-args"; import type { OptionDefinition } from "command-line-args";
import type { KeywordLike } from "./keywords.js"; import type { RawKeywordLike } from "./keywords.js";
import { readFile } from "node:fs/promises"; import { readFile } from "node:fs/promises";
import cmdline from "command-line-args"; import cmdline from "command-line-args";
import yaml from "js-yaml"; import yaml from "js-yaml";
@ -12,7 +12,7 @@ export interface Configuration {
watchFile: boolean; watchFile: boolean;
includePeers?: number[]; includePeers?: number[];
excludePeers?: number[]; excludePeers?: number[];
keywords?: KeywordLike[]; keywords?: RawKeywordLike[];
} }
const optionDefinitions: OptionDefinition[] = [ const optionDefinitions: OptionDefinition[] = [
@ -54,19 +54,22 @@ if (cli["exclude-peers"]) {
} }
} }
export async function readKeywords(filePath: string): Promise<KeywordLike[]> { export async function readKeywords(filePath: string): Promise<RawKeywordLike[]> {
const doc = yaml.load(await readFile(filePath, "utf8")) as { keywords?: any[] }; const doc = yaml.load(await readFile(filePath, "utf8")) as { keywords?: any[] };
if (doc.keywords && doc.keywords.constructor.name === "Array") { if (doc.keywords && doc.keywords.constructor.name === "Array") {
const keywords: KeywordLike[] = []; const keywords: RawKeywordLike[] = [];
for (const item of doc.keywords) { for (const item of doc.keywords) {
if (typeof item === "string") { if (typeof item === "string") {
keywords.push(item); keywords.push(item);
} else if (typeof item === "object" && item.name && item.pattern) { } else if (typeof item === "object" && typeof item.name === "string") {
keywords.push({ if (typeof item.pattern === "string") {
name: item.name, keywords.push({
pattern: new RegExp(item.pattern, "gi"), name: item.name,
}); pattern: item.pattern,
word: Boolean(item.word ?? false),
});
}
} }
} }
return keywords; return keywords;

View file

@ -3,37 +3,55 @@ import { PropagationAction } from "@mtcute/dispatcher";
import { Counter } from "prom-client"; import { Counter } from "prom-client";
import { config } from "./config.js"; import { config } from "./config.js";
import { peersConfigFilter } from "./filters.js"; import { peersConfigFilter } from "./filters.js";
import { escapeRegex } from "./utils.js";
/**
 * Named keyword entry in its uncompiled form: the pattern is still a
 * regular-expression source string.
 */
export interface RawKeywordPattern {
  /** Name the keyword is reported under. */
  name: string;
  /** Regular-expression source text (not yet compiled). */
  pattern: string;
  /** When true, the pattern is only meant to match between word borders. */
  word: boolean;
}

/** An uncompiled keyword: either a bare literal word or a named pattern. */
export type RawKeywordLike = string | RawKeywordPattern;

/** Keyword whose pattern has been compiled into a `RegExp`. */
export interface KeywordPattern {
  /** Name the keyword is reported under. */
  name: string;
  /** Compiled expression that is run against message text. */
  pattern: RegExp;
}
/**
 * Compiles raw keyword definitions into `RegExp`-based patterns.
 *
 * - A bare string is escaped and matched literally, as a whole word.
 * - A named pattern is used as regular-expression source; when its `word`
 *   flag is set, the pattern must sit between word borders.
 *
 * A word border is the start/end of the text, any whitespace, or one of the
 * ASCII punctuation characters listed below.
 *
 * Every expression is compiled with the `g` and `i` flags:
 * - `g` makes `text.match(pattern).length` equal the number of occurrences
 *   (without it the result holds one match plus its capture groups);
 * - `i` keeps matching case-insensitive.
 *
 * Note that a global `RegExp` carries `lastIndex` state when used with
 * `test()`/`exec()`; `String.prototype.match` is not affected by that.
 *
 * @param raw Keyword definitions to compile.
 * @returns One compiled pattern per input entry, in the same order.
 * @throws SyntaxError if a pattern is not a valid regular expression.
 */
export function rawToPatterns(raw: RawKeywordLike[]): KeywordPattern[] {
  // Loop-invariant, so built once: punctuation that delimits a word, escaped
  // for safe use inside a character class.
  const wordBorder = escapeRegex("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~");
  // Lookarounds instead of consuming groups: a delimiter shared by two
  // adjacent hits ("woof woof") must stay available to both of them.
  const borderStart = `(?<=[${wordBorder}\\s]|^)`;
  const borderEnd = `(?=[${wordBorder}\\s]|$)`;

  const patterns: KeywordPattern[] = [];
  for (const keyword of raw) {
    let name: string;
    let body: string;
    let addBorders: boolean;

    if (typeof keyword === "string") {
      name = keyword;
      body = escapeRegex(keyword);
      addBorders = true;
    } else {
      name = keyword.name;
      body = keyword.pattern;
      addBorders = keyword.word;
    }

    // Group the body so the borders apply to the whole pattern, not just to
    // the first/last alternative of something like `cat|dog`.
    const source = addBorders ? `${borderStart}(?:${body})${borderEnd}` : body;

    patterns.push({
      name,
      pattern: new RegExp(source, "gi"),
    });
  }
  return patterns;
}
export class KeywordsCounter extends Counter { export class KeywordsCounter extends Counter {
private _dp: Dispatcher; private _dp: Dispatcher;
private _keywords: KeywordLike[]; private _keywords: KeywordPattern[];
constructor(dp: Dispatcher, keywords: KeywordLike[] = []) { constructor(dp: Dispatcher, keywords: KeywordPattern[] = []) {
super({ super({
name: "messenger_dialog_keywords_count", name: "messenger_dialog_keywords_count",
help: "Number of keywords found in messages since exporter startup", help: "Number of keywords found in messages since exporter startup",
@ -42,24 +60,18 @@ export class KeywordsCounter extends Counter {
this._dp = dp; this._dp = dp;
this._keywords = keywords; this._keywords = keywords;
dp.onNewMessage(peersConfigFilter(config), async (msg) => { this._dp.onNewMessage(peersConfigFilter(config), async (msg) => {
for (const kw of this._keywords) { for (const kw of this._keywords) {
let count; const count = (msg.text.match(kw.pattern) ?? []).length;
let kwname;
if (typeof kw === "string") { // this will prevent from flooding metrics with keywords that had never been triggered yet
const words = msg.text.toLowerCase().split(" ");
count = words.filter(w => w === kw).length;
kwname = kw;
} else {
count = (msg.text.match(kw.pattern) || []).length;
kwname = kw.name;
}
if (count === 0) { if (count === 0) {
continue; continue;
} }
this.inc({ this.inc({
peerId: msg.chat.id, peerId: msg.chat.id,
keyword: kwname, keyword: kw.name,
}, count); }, count);
} }
return PropagationAction.Continue; return PropagationAction.Continue;
@ -70,7 +82,7 @@ export class KeywordsCounter extends Counter {
return this._keywords; return this._keywords;
} }
public setKeywords(keywords: KeywordLike[]) { public setKeywords(keywords: KeywordPattern[]) {
this._keywords = keywords; this._keywords = keywords;
} }
} }

View file

@ -5,6 +5,7 @@ import { collectDefaultMetrics, Registry } from "prom-client";
import { config, readKeywords } from "./config.js"; import { config, readKeywords } from "./config.js";
import * as env from "./env.js"; import * as env from "./env.js";
import { rawToPatterns } from "./keywords.js";
import * as metrics from "./metrics.js"; import * as metrics from "./metrics.js";
import MetricsServer from "./server.js"; import MetricsServer from "./server.js";
@ -36,7 +37,7 @@ registry.registerMetric(metrics.newUnreadCountGauge(tg));
registry.registerMetric(metrics.newMessagesCounter(dp)); registry.registerMetric(metrics.newMessagesCounter(dp));
if (config.keywords) { if (config.keywords) {
const counter = new metrics.KeywordsCounter(dp, config.keywords); const counter = new metrics.KeywordsCounter(dp, rawToPatterns(config.keywords));
registry.registerMetric(counter); registry.registerMetric(counter);
if (config.watchFile) { if (config.watchFile) {
@ -47,7 +48,7 @@ if (config.keywords) {
console.log("[watch-file] Keywords file was updated. Re-reading keywords configuration..."); console.log("[watch-file] Keywords file was updated. Re-reading keywords configuration...");
try { try {
config.keywords = await readKeywords(config.keywordsFile); config.keywords = await readKeywords(config.keywordsFile);
counter.setKeywords(config.keywords); counter.setKeywords(rawToPatterns(config.keywords));
} catch (e) { } catch (e) {
console.error("Failed to read keywords file", config.keywordsFile, e); console.error("Failed to read keywords file", config.keywordsFile, e);
} }

View file

@ -5,7 +5,7 @@ import { Counter, Gauge } from "prom-client";
import { config } from "./config.js"; import { config } from "./config.js";
import { peersConfigBoolFilter, peersConfigFilter } from "./filters.js"; import { peersConfigBoolFilter, peersConfigFilter } from "./filters.js";
import { KeywordsCounter, newWordsCounter } from "./keywords.js"; import { KeywordsCounter } from "./keywords.js";
function newMessagesCounter(dp: Dispatcher) { function newMessagesCounter(dp: Dispatcher) {
const counter = new Counter({ const counter = new Counter({
@ -67,6 +67,25 @@ function newUnreadCountGauge(tg: TelegramClient) {
return gauge; return gauge;
} }
/**
 * Creates the `messenger_dialog_words_count` counter and registers a
 * new-message handler on `dp` that increments it once for every word of each
 * message that passes `peersConfigFilter(config)`.
 *
 * Message text is lower-cased and split on runs of whitespace; empty tokens
 * are discarded, so messages with no text and repeated spaces do not produce
 * an empty `word` label.
 *
 * @param dp Dispatcher to attach the message handler to.
 * @returns The counter, labelled by `peerId` and `word`.
 */
function newWordsCounter(dp: Dispatcher) {
  const counter = new Counter({
    name: "messenger_dialog_words_count",
    help: "Number of words in messages since exporter startup",
    labelNames: ["peerId", "word"],
  });

  dp.onNewMessage(peersConfigFilter(config), async (msg) => {
    // Split on any whitespace (spaces, tabs, newlines) and drop the empty
    // strings that leading/trailing whitespace or empty text would yield.
    const words = msg.text
      .toLowerCase()
      .split(/\s+/)
      .filter((token) => token.length > 0);

    for (const word of words) {
      counter.inc({
        peerId: msg.chat.id,
        word,
      });
    }
    return PropagationAction.Continue;
  });

  return counter;
}
export { export {
KeywordsCounter, KeywordsCounter,
newMessagesCounter, newMessagesCounter,

3
src/utils.ts Normal file
View file

@ -0,0 +1,3 @@
/**
 * Backslash-escapes the characters `/ - \ ^ $ * + ? . ( ) | [ ] { }` in
 * `text` so the result can be embedded in a regular-expression source and
 * match the input literally.
 *
 * @param text Text to escape.
 * @returns `text` with each of the characters above prefixed by a backslash.
 */
export function escapeRegex(text: string) {
  const metaCharacters = /[/\-\\^$*+?.()|[\]{}]/g;
  // "$&" re-inserts the matched character after the added backslash.
  const escaped = text.replace(metaCharacters, "\\$&");
  return escaped;
}