add better handling of word patterns

This commit is contained in:
soffee 2025-04-08 21:33:12 +03:00
parent 69695e265a
commit fb5fd57238
7 changed files with 99 additions and 46 deletions

View file

@ -1,4 +1,19 @@
keywords:
# will match just word 'meow' (requires word border on both sides)
- meow
# will match plain regex
- name: woof
pattern: 'w[oa]+f'
# will match regex wrapped with word borders
# will match 'hi, woof :3', 'woof!', 'i heard a woof' but not 'i like subwoofers'
- name: woof
pattern: 'w[oa]+f'
word: true
# will match any word starting with 'aqua' (aquarium, aquatic, aquaculture, etc...)
# requires word border on both sides too
- name: aqua
pattern: 'aqua.*?'
word: true

View file

@ -1,7 +1,7 @@
{
"name": "mtproto_exporter",
"type": "module",
"version": "1.1.0",
"version": "1.2.0",
"packageManager": "pnpm@10.6.5",
"license": "MIT",
"scripts": {

View file

@ -1,5 +1,5 @@
import type { OptionDefinition } from "command-line-args";
import type { KeywordLike } from "./keywords.js";
import type { RawKeywordLike } from "./keywords.js";
import { readFile } from "node:fs/promises";
import cmdline from "command-line-args";
import yaml from "js-yaml";
@ -12,7 +12,7 @@ export interface Configuration {
watchFile: boolean;
includePeers?: number[];
excludePeers?: number[];
keywords?: KeywordLike[];
keywords?: RawKeywordLike[];
}
const optionDefinitions: OptionDefinition[] = [
@ -54,19 +54,22 @@ if (cli["exclude-peers"]) {
}
}
export async function readKeywords(filePath: string): Promise<KeywordLike[]> {
export async function readKeywords(filePath: string): Promise<RawKeywordLike[]> {
const doc = yaml.load(await readFile(filePath, "utf8")) as { keywords?: any[] };
if (doc.keywords && doc.keywords.constructor.name === "Array") {
const keywords: KeywordLike[] = [];
const keywords: RawKeywordLike[] = [];
for (const item of doc.keywords) {
if (typeof item === "string") {
keywords.push(item);
} else if (typeof item === "object" && item.name && item.pattern) {
keywords.push({
name: item.name,
pattern: new RegExp(item.pattern, "gi"),
});
} else if (typeof item === "object" && typeof item.name === "string") {
if (typeof item.pattern === "string") {
keywords.push({
name: item.name,
pattern: item.pattern,
word: Boolean(item.word ?? false),
});
}
}
}
return keywords;

View file

@ -3,37 +3,55 @@ import { PropagationAction } from "@mtcute/dispatcher";
import { Counter } from "prom-client";
import { config } from "./config.js";
import { peersConfigFilter } from "./filters.js";
import { escapeRegex } from "./utils.js";
interface KeywordPattern {
/**
 * Keyword entry in its uncompiled form, as produced by `readKeywords` from the
 * keywords file. `rawToPatterns` turns it into a `KeywordPattern`.
 */
export interface RawKeywordPattern {
// Carried over unchanged to the compiled pattern's `name`.
name: string;
// Regular-expression source text; compiled with `new RegExp` in `rawToPatterns`.
pattern: string;
// When true, `rawToPatterns` wraps the pattern with word-border groups on both sides.
word: boolean;
}
export type RawKeywordLike = string | RawKeywordPattern;
/**
 * Compiled keyword matcher, as built by `rawToPatterns` and consumed by
 * `KeywordsCounter`.
 */
export interface KeywordPattern {
// Reported as the `keyword` label value when the counter is incremented.
name: string;
// Expression matched against each incoming message's text.
pattern: RegExp;
}
export type KeywordLike = string | KeywordPattern;
export function rawToPatterns(raw: RawKeywordLike[]): KeywordPattern[] {
const patterns: KeywordPattern[] = [];
for (const keyword of raw) {
let pattern;
let name;
let addBorders = false;
export function newWordsCounter(dp: Dispatcher) {
const counter = new Counter({
name: "messenger_dialog_words_count",
help: "Number of words in messages since exporter startup",
labelNames: ["peerId", "word"],
});
dp.onNewMessage(peersConfigFilter(config), async (msg) => {
const words = msg.text.toLowerCase().split(" ");
for (const w of words) {
counter.inc({
peerId: msg.chat.id,
word: w,
});
if (typeof keyword === "string") {
pattern = escapeRegex(keyword);
name = keyword;
addBorders = true;
} else {
pattern = keyword.pattern;
name = keyword.name;
addBorders = keyword.word;
}
return PropagationAction.Continue;
});
return counter;
const wordBorder = escapeRegex("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~");
const borderStart = addBorders ? `(?:[${wordBorder}\\s]|^)` : "";
const borderEnd = addBorders ? `(?:[${wordBorder}\\s]|$)` : "";
patterns.push({
name,
pattern: new RegExp(borderStart + pattern + borderEnd),
});
}
return patterns;
}
export class KeywordsCounter extends Counter {
private _dp: Dispatcher;
private _keywords: KeywordLike[];
constructor(dp: Dispatcher, keywords: KeywordLike[] = []) {
private _keywords: KeywordPattern[];
constructor(dp: Dispatcher, keywords: KeywordPattern[] = []) {
super({
name: "messenger_dialog_keywords_count",
help: "Number of keywords found in messages since exporter startup",
@ -42,24 +60,18 @@ export class KeywordsCounter extends Counter {
this._dp = dp;
this._keywords = keywords;
dp.onNewMessage(peersConfigFilter(config), async (msg) => {
this._dp.onNewMessage(peersConfigFilter(config), async (msg) => {
for (const kw of this._keywords) {
let count;
let kwname;
if (typeof kw === "string") {
const words = msg.text.toLowerCase().split(" ");
count = words.filter(w => w === kw).length;
kwname = kw;
} else {
count = (msg.text.match(kw.pattern) || []).length;
kwname = kw.name;
}
const count = (msg.text.match(kw.pattern) ?? []).length;
// this prevents flooding metrics with keywords that have never been triggered yet
if (count === 0) {
continue;
}
this.inc({
peerId: msg.chat.id,
keyword: kwname,
keyword: kw.name,
}, count);
}
return PropagationAction.Continue;
@ -70,7 +82,7 @@ export class KeywordsCounter extends Counter {
return this._keywords;
}
public setKeywords(keywords: KeywordLike[]) {
public setKeywords(keywords: KeywordPattern[]) {
this._keywords = keywords;
}
}

View file

@ -5,6 +5,7 @@ import { collectDefaultMetrics, Registry } from "prom-client";
import { config, readKeywords } from "./config.js";
import * as env from "./env.js";
import { rawToPatterns } from "./keywords.js";
import * as metrics from "./metrics.js";
import MetricsServer from "./server.js";
@ -36,7 +37,7 @@ registry.registerMetric(metrics.newUnreadCountGauge(tg));
registry.registerMetric(metrics.newMessagesCounter(dp));
if (config.keywords) {
const counter = new metrics.KeywordsCounter(dp, config.keywords);
const counter = new metrics.KeywordsCounter(dp, rawToPatterns(config.keywords));
registry.registerMetric(counter);
if (config.watchFile) {
@ -47,7 +48,7 @@ if (config.keywords) {
console.log("[watch-file] Keywords file was updated. Re-reading keywords configuration...");
try {
config.keywords = await readKeywords(config.keywordsFile);
counter.setKeywords(config.keywords);
counter.setKeywords(rawToPatterns(config.keywords));
} catch (e) {
console.error("Failed to read keywords file", config.keywordsFile, e);
}

View file

@ -5,7 +5,7 @@ import { Counter, Gauge } from "prom-client";
import { config } from "./config.js";
import { peersConfigBoolFilter, peersConfigFilter } from "./filters.js";
import { KeywordsCounter, newWordsCounter } from "./keywords.js";
import { KeywordsCounter } from "./keywords.js";
function newMessagesCounter(dp: Dispatcher) {
const counter = new Counter({
@ -67,6 +67,25 @@ function newUnreadCountGauge(tg: TelegramClient) {
return gauge;
}
/**
 * Creates the `messenger_dialog_words_count` counter and subscribes it to new
 * messages that pass `peersConfigFilter(config)`.
 *
 * Each message's text is lower-cased and split into words; the counter is
 * incremented once per word, labelled with the chat id and the word itself.
 *
 * @param dp Dispatcher whose new-message events feed the counter.
 * @returns The counter, ready to be registered in a metrics registry.
 */
function newWordsCounter(dp: Dispatcher) {
  const counter = new Counter({
    name: "messenger_dialog_words_count",
    help: "Number of words in messages since exporter startup",
    labelNames: ["peerId", "word"],
  });

  dp.onNewMessage(peersConfigFilter(config), async (msg) => {
    // Split on any run of whitespace (spaces, tabs, newlines) and drop empty
    // tokens. Splitting on a single " " counted an empty-string "word" for
    // messages with empty text and for every pair of consecutive spaces, and
    // glued together words separated by line breaks.
    const words = msg.text
      .toLowerCase()
      .split(/\s+/)
      .filter(w => w.length > 0);

    for (const w of words) {
      counter.inc({
        peerId: msg.chat.id,
        word: w,
      });
    }

    // Let other handlers registered on the dispatcher see the message too.
    return PropagationAction.Continue;
  });

  return counter;
}
export {
KeywordsCounter,
newMessagesCounter,

3
src/utils.ts Normal file
View file

@ -0,0 +1,3 @@
/**
 * Makes `text` safe to embed in a regular expression as a literal.
 *
 * Every occurrence of one of the characters
 * `/ - \ ^ $ * + ? . ( ) | [ ] { }` is prefixed with a backslash; all other
 * characters are returned untouched.
 *
 * @param text Arbitrary input string.
 * @returns The input with regex metacharacters backslash-escaped.
 */
export function escapeRegex(text: string) {
  const metaChars = /[/\-\\^$*+?.()|[\]{}]/g;
  // "$&" re-inserts the matched character after the added backslash.
  const backslashPrefixed = "\\$&";
  return text.replace(metaChars, backslashPrefixed);
}