Skip to content

Commit 3961d31

Browse files
committed
Add Prometheus metrics
1 parent 542552a commit 3961d31

File tree

7 files changed

+171
-1
lines changed

7 files changed

+171
-1
lines changed

README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ strfry is a relay for the [nostr protocol](https://github.com/nostr-protocol/nos
1212
* Durable writes: The relay never returns an `OK` until an event has been confirmed as committed to the DB
1313
* Built-in support for real-time streaming (up/down/both) events from remote relays, and bulk import/export of events from/to jsonl files
1414
* [negentropy](https://github.com/hoytech/negentropy)-based set reconciliation for efficient syncing with clients or between relays, accurate counting of events between relays, and more
15+
* Prometheus metrics endpoint for monitoring relay activity (client/relay messages by verb, events by kind)
1516

1617
If you are using strfry, please [join our telegram chat](https://t.me/strfry_users). Hopefully soon we'll migrate this to nostr.
1718

@@ -31,6 +32,8 @@ If you are using strfry, please [join our telegram chat](https://t.me/strfry_use
3132
* [Fried Exports](#fried-exports)
3233
* [Stream](#stream)
3334
* [Sync](#sync)
35+
* [Monitoring](#monitoring)
36+
* [Prometheus Metrics](#prometheus-metrics)
3437
* [Advanced](#advanced)
3538
* [DB Upgrade](#db-upgrade)
3639
* [DB Compaction](#db-compaction)
@@ -185,6 +188,34 @@ By default strfry keeps a precomputed BTree to speed up full-DB syncs. You can a
185188

186189

187190

191+
## Monitoring
192+
193+
### Prometheus Metrics
194+
195+
strfry includes built-in Prometheus metrics support for monitoring relay activity. Metrics are exposed via HTTP at the `/metrics` endpoint on the same port as the relay WebSocket server.
196+
197+
For example, if your relay is running on `localhost:7777`, you can access metrics at `http://localhost:7777/metrics`
198+
199+
The following metrics are available:
200+
201+
* **`nostr_client_messages_total{verb}`** - Total number of messages received from clients, broken down by verb (EVENT, REQ, CLOSE, NEG-OPEN, NEG-MSG, NEG-CLOSE)
202+
* **`nostr_relay_messages_total{verb}`** - Total number of messages sent to clients, broken down by verb (EVENT, OK, EOSE, NOTICE, NEG-MSG, NEG-ERR)
203+
* **`nostr_events_total{kind}`** - Total number of events processed, broken down by event kind (0, 1, 3, 4, etc.)
204+
205+
To scrape these metrics with Prometheus, add a job to your `prometheus.yml`:
206+
207+
```yaml
208+
scrape_configs:
209+
- job_name: 'strfry'
210+
static_configs:
211+
- targets: ['localhost:7777']
212+
metrics_path: '/metrics'
213+
```
214+
215+
See the [Prometheus metrics documentation](docs/prometheus-metrics.md) for detailed information and example Grafana queries.
216+
217+
218+
188219
## Advanced
189220
190221
### DB Upgrade

src/PrometheusMetrics.h

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#pragma once

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <string>
#include <sstream>
#include <map>
#include <mutex>
#include <shared_mutex>

// Simple thread-safe Prometheus metrics implementation.
// Supports monotonically-increasing counters, optionally broken down by one
// label. Output follows the Prometheus text exposition format (version 0.0.4),
// including escaping of label values.

class PrometheusMetrics {
  public:
    // Counter for tracking cumulative values.
    // Lock-free; safe to increment concurrently from any thread.
    class Counter {
      private:
        std::atomic<uint64_t> value{0};

      public:
        // Add n (default 1) to the counter.
        void inc(uint64_t n = 1) {
            value.fetch_add(n, std::memory_order_relaxed);
        }

        // Current value. Relaxed load: scrapes only need an
        // eventually-consistent snapshot, not a synchronised one.
        uint64_t get() const {
            return value.load(std::memory_order_relaxed);
        }
    };

    // Labeled counter - maintains one Counter per distinct label value.
    //
    // Label values may come from untrusted input (e.g. client-supplied
    // message verbs), so the number of distinct labels is capped: once
    // maxLabels distinct values exist, increments for new values are folded
    // into the "other" label to prevent unbounded memory growth. Values are
    // escaped for the Prometheus text format at render time, not here.
    class LabeledCounter {
      private:
        static constexpr std::size_t maxLabels = 4096;

        mutable std::shared_mutex mutex;
        // std::map nodes have stable addresses, so the atomic inside a
        // Counter never moves once created; incrementing an existing entry
        // under only a shared (read) lock is therefore safe.
        std::map<std::string, Counter> counters;

      public:
        // Add n (default 1) to the counter for the given label value.
        void inc(const std::string& label, uint64_t n = 1) {
            // Fast path: label already exists, shared lock suffices.
            {
                std::shared_lock<std::shared_mutex> lock(mutex);
                auto it = counters.find(label);
                if (it != counters.end()) {
                    it->second.inc(n);
                    return;
                }
            }

            // Slow path: create the counter. Another thread may have raced
            // us between the locks; operator[] handles both cases.
            std::unique_lock<std::shared_mutex> lock(mutex);
            if (counters.size() >= maxLabels && !counters.count(label)) {
                counters["other"].inc(n); // cardinality cap reached
                return;
            }
            counters[label].inc(n);
        }

        // Consistent snapshot of all label -> value pairs.
        std::map<std::string, uint64_t> getAll() const {
            std::shared_lock<std::shared_mutex> lock(mutex);
            std::map<std::string, uint64_t> result;
            for (const auto& [label, counter] : counters) {
                result[label] = counter.get();
            }
            return result;
        }
    };

    // Singleton accessor; thread-safe per C++11 static-local initialisation.
    static PrometheusMetrics& getInstance() {
        static PrometheusMetrics instance;
        return instance;
    }

    // Nostr client message counters (messages FROM clients TO relay)
    LabeledCounter nostrClientMessages;

    // Nostr relay message counters (messages FROM relay TO clients)
    LabeledCounter nostrRelayMessages;

    // Nostr event counters (by kind)
    LabeledCounter nostrEventsByKind;

    // Generate Prometheus text format (v0.0.4) output for all metrics.
    std::string render() const {
        std::ostringstream out;
        renderCounter(out, "nostr_client_messages_total", "Total number of Nostr client messages by verb", "verb", nostrClientMessages);
        renderCounter(out, "nostr_relay_messages_total", "Total number of Nostr relay messages by verb", "verb", nostrRelayMessages);
        renderCounter(out, "nostr_events_total", "Total number of Nostr events by kind", "kind", nostrEventsByKind);
        return out.str();
    }

  private:
    // Escape a label value per the Prometheus text exposition format:
    // backslash, double-quote and newline must be backslash-escaped.
    // Some label values (e.g. NEG-* verbs) are attacker-controlled, so this
    // also prevents clients from injecting forged metric lines.
    static std::string escapeLabelValue(const std::string& s) {
        std::string out;
        out.reserve(s.size());
        for (char c : s) {
            if (c == '\\' || c == '"') {
                out += '\\';
                out += c;
            } else if (c == '\n') {
                out += "\\n";
            } else {
                out += c;
            }
        }
        return out;
    }

    // Emit the HELP/TYPE header plus one sample line per label value.
    static void renderCounter(std::ostringstream& out, const char* name, const char* help, const char* labelName, const LabeledCounter& counter) {
        out << "# HELP " << name << " " << help << "\n";
        out << "# TYPE " << name << " counter\n";
        for (const auto& [label, count] : counter.getAll()) {
            out << name << "{" << labelName << "=\"" << escapeLabelValue(label) << "\"} " << count << "\n";
        }
    }

    PrometheusMetrics() = default;
    PrometheusMetrics(const PrometheusMetrics&) = delete;
    PrometheusMetrics& operator=(const PrometheusMetrics&) = delete;
};

// Convenience macros for incrementing metrics
#define PROM_INC_CLIENT_MSG(verb) PrometheusMetrics::getInstance().nostrClientMessages.inc(verb)
#define PROM_INC_RELAY_MSG(verb) PrometheusMetrics::getInstance().nostrRelayMessages.inc(verb)
#define PROM_INC_EVENT_KIND(kind) PrometheusMetrics::getInstance().nostrEventsByKind.inc(kind)

src/apps/relay/RelayIngester.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ void RelayServer::runIngester(ThreadPool<MsgIngester>::Thread &thr) {
2626
auto &cmd = jsonGetString(arr[0], "first element not a command like REQ");
2727

2828
if (cmd == "EVENT") {
29+
PROM_INC_CLIENT_MSG("EVENT");
2930
if (cfg().relay__logging__dumpInEvents) LI << "[" << msg->connId << "] dumpInEvent: " << msg->payload;
3031

3132
try {
@@ -36,6 +37,7 @@ void RelayServer::runIngester(ThreadPool<MsgIngester>::Thread &thr) {
3637
if (cfg().relay__logging__invalidEvents) LI << "Rejected invalid event: " << e.what();
3738
}
3839
} else if (cmd == "REQ") {
40+
PROM_INC_CLIENT_MSG("REQ");
3941
if (cfg().relay__logging__dumpInReqs) LI << "[" << msg->connId << "] dumpInReq: " << msg->payload;
4042

4143
try {
@@ -44,6 +46,7 @@ void RelayServer::runIngester(ThreadPool<MsgIngester>::Thread &thr) {
4446
sendNoticeError(msg->connId, std::string("bad req: ") + e.what());
4547
}
4648
} else if (cmd == "CLOSE") {
49+
PROM_INC_CLIENT_MSG("CLOSE");
4750
if (cfg().relay__logging__dumpInReqs) LI << "[" << msg->connId << "] dumpInReq: " << msg->payload;
4851

4952
try {
@@ -52,6 +55,7 @@ void RelayServer::runIngester(ThreadPool<MsgIngester>::Thread &thr) {
5255
sendNoticeError(msg->connId, std::string("bad close: ") + e.what());
5356
}
5457
} else if (cmd.starts_with("NEG-")) {
58+
PROM_INC_CLIENT_MSG(std::string(cmd));
5559
if (!cfg().relay__negentropy__enabled) throw herr("negentropy disabled");
5660

5761
try {
@@ -91,6 +95,9 @@ void RelayServer::ingesterProcessEvent(lmdb::txn &txn, uint64_t connId, std::str
9195
parseAndVerifyEvent(origJson, secpCtx, true, true, packedStr, jsonStr);
9296

9397
PackedEventView packed(packedStr);
98+
99+
// Track event kind metrics
100+
PROM_INC_EVENT_KIND(std::to_string(packed.kind()));
94101

95102
{
96103
bool foundProtected = false;

src/apps/relay/RelayNegentropy.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ void RelayServer::runNegentropy(ThreadPool<MsgNegentropy>::Thread &thr) {
101101
} catch (std::exception &e) {
102102
LI << "[" << connId << "] Error parsing negentropy message: " << e.what();
103103

104+
PROM_INC_RELAY_MSG("NEG-ERR");
104105
sendToConn(connId, tao::json::to_string(tao::json::value::array({
105106
"NEG-ERR",
106107
subId.str(),
@@ -111,6 +112,7 @@ void RelayServer::runNegentropy(ThreadPool<MsgNegentropy>::Thread &thr) {
111112
return;
112113
}
113114

115+
PROM_INC_RELAY_MSG("NEG-MSG");
114116
sendToConn(connId, tao::json::to_string(tao::json::value::array({
115117
"NEG-MSG",
116118
subId.str(),
@@ -146,6 +148,7 @@ void RelayServer::runNegentropy(ThreadPool<MsgNegentropy>::Thread &thr) {
146148
if (view->levIds.size() > cfg().relay__negentropy__maxSyncEvents) {
147149
LI << "[" << sub.connId << "] Negentropy query size exceeded " << cfg().relay__negentropy__maxSyncEvents;
148150

151+
PROM_INC_RELAY_MSG("NEG-ERR");
149152
sendToConn(sub.connId, tao::json::to_string(tao::json::value::array({
150153
"NEG-ERR",
151154
sub.subId.str(),
@@ -225,6 +228,7 @@ void RelayServer::runNegentropy(ThreadPool<MsgNegentropy>::Thread &thr) {
225228
} else if (auto msg = std::get_if<MsgNegentropy::NegMsg>(&newMsg.msg)) {
226229
auto *userView = views.findView(msg->connId, msg->subId);
227230
if (!userView) {
231+
PROM_INC_RELAY_MSG("NEG-ERR");
228232
sendToConn(msg->connId, tao::json::to_string(tao::json::value::array({
229233
"NEG-ERR",
230234
msg->subId.str(),

src/apps/relay/RelayReqWorker.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ void RelayServer::runReqWorker(ThreadPool<MsgReqWorker>::Thread &thr) {
1111
};
1212

1313
queries.onComplete = [&](lmdb::txn &, Subscription &sub){
14+
PROM_INC_RELAY_MSG("EOSE");
1415
sendToConn(sub.connId, tao::json::to_string(tao::json::value::array({ "EOSE", sub.subId.str() })));
1516
tpReqMonitor.dispatch(sub.connId, MsgReqMonitor{MsgReqMonitor::NewSub{std::move(sub)}});
1617
};

src/apps/relay/RelayServer.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "filters.h"
1919
#include "jsonParseUtils.h"
2020
#include "Decompressor.h"
21+
#include "PrometheusMetrics.h"
2122

2223

2324

@@ -197,6 +198,7 @@ struct RelayServer {
197198
}
198199

199200
void sendEvent(uint64_t connId, const SubId &subId, std::string_view evJson) {
201+
PROM_INC_RELAY_MSG("EVENT");
200202
auto subIdSv = subId.sv();
201203

202204
std::string reply;
@@ -217,13 +219,15 @@ struct RelayServer {
217219
}
218220

219221
void sendNoticeError(uint64_t connId, std::string &&payload) {
222+
PROM_INC_RELAY_MSG("NOTICE");
220223
LI << "sending error to [" << connId << "]: " << payload;
221224
auto reply = tao::json::value::array({ "NOTICE", std::string("ERROR: ") + payload });
222225
tpWebsocket.dispatch(0, MsgWebsocket{MsgWebsocket::Send{connId, std::move(tao::json::to_string(reply))}});
223226
hubTrigger->send();
224227
}
225228

226229
void sendOKResponse(uint64_t connId, std::string_view eventIdHex, bool written, std::string_view message) {
230+
PROM_INC_RELAY_MSG("OK");
227231
auto reply = tao::json::value::array({ "OK", eventIdHex, written, message });
228232
tpWebsocket.dispatch(0, MsgWebsocket{MsgWebsocket::Send{connId, std::move(tao::json::to_string(reply))}});
229233
hubTrigger->send();

src/apps/relay/RelayWebsocket.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,11 @@ void RelayServer::runWebsocket(ThreadPool<MsgWebsocket>::Thread &thr) {
175175
std::string host = req.getHeader("host").toString();
176176
std::string url = req.getUrl().toString();
177177

178-
if (url == "/.well-known/nodeinfo") {
178+
if (url == "/metrics") {
179+
auto metrics = PrometheusMetrics::getInstance().render();
180+
auto response = preGenerateHttpResponse("text/plain; version=0.0.4", metrics);
181+
res->write(response.data(), response.size());
182+
} else if (url == "/.well-known/nodeinfo") {
179183
auto nodeInfo = getNodeInfoHttpResponse(host);
180184
res->write(nodeInfo.data(), nodeInfo.size());
181185
} else if (url == "/nodeinfo/2.1") {
@@ -291,6 +295,7 @@ void RelayServer::runWebsocket(ThreadPool<MsgWebsocket>::Thread &thr) {
291295
tempBuf += "]";
292296

293297
for (auto &item : msg->list) {
298+
PROM_INC_RELAY_MSG("EVENT");
294299
auto subIdSv = item.subId.sv();
295300
auto *p = tempBuf.data() + MAX_SUBID_SIZE - subIdSv.size();
296301
memcpy(p, "[\"EVENT\",\"", 10);

0 commit comments

Comments
 (0)