Skip to content

Commit 690c044

Browse files
authored
Improve metrics for broken routes (#321)
* Improve metrics for broken routes
* Remove record skfs
1 parent d5c920a commit 690c044

File tree

5 files changed

+126
-94
lines changed

5 files changed

+126
-94
lines changed

include/hpr_metrics.hrl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
-ifdef(TEST).
22
-define(METRICS_TICK_INTERVAL, timer:seconds(1)).
33
-else.
4-
-define(METRICS_TICK_INTERVAL, timer:seconds(10)).
4+
-define(METRICS_TICK_INTERVAL, timer:seconds(30)).
55
-endif.
66
-define(METRICS_TICK, '_hpr_metrics_tick').
77

@@ -13,7 +13,7 @@
1313
-define(METRICS_ROUTES_GAUGE, "hpr_routes_gauge").
1414
-define(METRICS_EUI_PAIRS_GAUGE, "hpr_eui_pairs_gauge").
1515
-define(METRICS_SKFS_GAUGE, "hpr_skfs_gauge").
16-
-define(METRICS_WEIRD_ROUTES_GAUGE, "hpr_weird_routes_gauge").
16+
-define(METRICS_BROKEN_ROUTES_GAUGE, "hpr_broken_routes_gauge").
1717
-define(METRICS_PACKET_REPORT_HISTOGRAM, "hpr_packet_report_histogram").
1818
-define(METRICS_MULTI_BUY_GET_HISTOGRAM, "hpr_multi_buy_get_histogram").
1919
-define(METRICS_FIND_ROUTES_HISTOGRAM, "hpr_find_routes_histogram").
@@ -36,7 +36,7 @@
3636
{?METRICS_ROUTES_GAUGE, prometheus_gauge, [], "Number of Routes"},
3737
{?METRICS_EUI_PAIRS_GAUGE, prometheus_gauge, [], "Number of EUI Pairs"},
3838
{?METRICS_SKFS_GAUGE, prometheus_gauge, [], "Number of SKFs"},
39-
{?METRICS_WEIRD_ROUTES_GAUGE, prometheus_gauge, [], "Number of weird routes"},
39+
{?METRICS_BROKEN_ROUTES_GAUGE, prometheus_gauge, [oui], "Number of broken routes"},
4040
{?METRICS_PACKET_REPORT_HISTOGRAM, prometheus_histogram, [status], "Packet Reports"},
4141
{?METRICS_MULTI_BUY_GET_HISTOGRAM, prometheus_histogram, [status], "Multi Buy Service Get"},
4242
{?METRICS_FIND_ROUTES_HISTOGRAM, prometheus_histogram, [], "Find Routes"},

src/cli/hpr_cli_config.erl

Lines changed: 72 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ config_usage() ->
5757
" [--display_skfs] default: false (SKFs not included)\n",
5858
"config route refresh_all - Refresh all routes\n",
5959
" [--minimum] default: 1 (need a minimum of 1 SKFs ro be updated)\n",
60+
"config route refresh_broken - Refresh broken routes\n",
6061
"config route refresh <route_id> - Refresh route's EUIs, SKFs, DevAddrRanges\n",
6162
"config route activate <route_id> - Activate route\n",
6263
"config route deactivate <route_id> - Deactivate route\n",
@@ -103,6 +104,12 @@ config_cmd() ->
103104
],
104105
fun config_route_refresh_all/3
105106
],
107+
[
108+
["config", "route", "refresh_broken"],
109+
[],
110+
[],
111+
fun config_route_refresh_broken/3
112+
],
106113
[
107114
["config", "route", "refresh", '*'],
108115
[],
@@ -299,44 +306,78 @@ config_route_refresh_all(["config", "route", "refresh_all"], [], Flags) ->
299306
RouteIDs = [ID || {ID, _} <- Sorted],
300307
Total = erlang:length(RouteIDs),
301308
lager:info("Found ~p routes to update", [Total]),
302-
route_refresh_all(
303-
Total, RouteIDs, backoff:init(timer:seconds(1), timer:minutes(1))
304-
)
309+
routes_refresh(Total, RouteIDs)
305310
end),
306-
c_text("command spawned @ ~p, look at logs and tail hpr_cli_config", [Pid]);
311+
c_text(
312+
"command spawned @ ~p, look at logs and `tail -F /opt/hpr/log/info.log | grep hpr_cli_config`",
313+
[Pid]
314+
);
307315
config_route_refresh_all(_, _, _) ->
308316
usage.
309317

310-
route_refresh_all(_Total, [], _Backoff0) ->
318+
config_route_refresh_broken(["config", "route", "refresh_broken"], [], _Flags) ->
319+
Pid = erlang:spawn(fun() ->
320+
RouteIDsWithDevAddr =
321+
hpr_devaddr_range_storage:foldl(
322+
fun({_, RouteID}, Acc) ->
323+
sets:add_element(RouteID, Acc)
324+
end,
325+
sets:new()
326+
),
327+
RouteIDs = hpr_route_storage:foldl(
328+
fun(RouteETS, RouteIDs) ->
329+
SKFCount =
330+
case ets:info(hpr_route_ets:skf_ets(RouteETS), size) of
331+
undefined -> 0;
332+
N -> N
333+
end,
334+
Route = hpr_route_ets:route(RouteETS),
335+
RouteID = hpr_route:id(Route),
336+
case SKFCount > 0 andalso not sets:is_element(RouteID, RouteIDsWithDevAddr) of
337+
true ->
338+
lager:warning(
339+
[{route_id, RouteID}, {oui, hpr_route:oui(Route)}],
340+
"BROKEN_ROUTES route has no devaddr ranges but has (~p) skfs",
341+
[SKFCount]
342+
),
343+
[RouteID | RouteIDs];
344+
false ->
345+
RouteIDs
346+
end
347+
end,
348+
[]
349+
),
350+
Total = erlang:length(RouteIDs),
351+
lager:info("Found ~p routes to fix", [Total]),
352+
routes_refresh(Total, RouteIDs)
353+
end),
354+
c_text(
355+
"command spawned @ ~p, look at logs and `tail -F /opt/hpr/log/info.log | grep hpr_cli_config`",
356+
[Pid]
357+
);
358+
config_route_refresh_broken(_, _, _) ->
359+
usage.
360+
361+
routes_refresh(_Total, []) ->
311362
lager:info("All done!");
312-
route_refresh_all(Total, [RouteID | T] = RouteIDs, Backoff0) ->
363+
routes_refresh(Total, [RouteID | RouteIDs]) ->
313364
CurrTotal = erlang:length(RouteIDs),
314365
IDX = Total - CurrTotal + 1,
315366
lager:info("~p/~p===== ~s =====", [
316367
IDX, Total, RouteID
317368
]),
318369
Start = erlang:system_time(millisecond),
319-
case try_refresh_route(RouteID) of
320-
ok ->
321-
End = erlang:system_time(millisecond),
322-
lager:info("took ~pms", [End - Start]),
323-
{_, Backoff1} = backoff:succeed(Backoff0),
324-
route_refresh_all(Total, T, Backoff1);
325-
error ->
326-
End = erlang:system_time(millisecond),
327-
lager:info("took ~pms", [End - Start]),
328-
{Delay, Backoff1} = backoff:fail(Backoff0),
329-
lager:info("sleeping ~pms", [Delay]),
330-
timer:sleep(Delay),
331-
route_refresh_all(Total, RouteIDs, Backoff1)
332-
end,
333-
ok.
334-
335-
try_refresh_route(RouteID) ->
336-
try hpr_route_stream_worker:refresh_route(RouteID) of
370+
try hpr_route_stream_worker:refresh_route(RouteID, 3) of
337371
{ok, Map} ->
338372
lager:info("| Type | Before | After | Removed | Added |"),
339373
lager:info("|------|---------|---------|---------|---------|"),
374+
lager:info("| ~4w | ~7w | ~7w | ~7w | ~7w |", [
375+
addr,
376+
maps:get(devaddr_before, Map),
377+
maps:get(devaddr_after, Map),
378+
maps:get(devaddr_removed, Map),
379+
maps:get(devaddr_added, Map)
380+
]),
340381
lager:info("| ~4w | ~7w | ~7w | ~7w | ~7w |", [
341382
eui,
342383
maps:get(eui_before, Map),
@@ -350,23 +391,16 @@ try_refresh_route(RouteID) ->
350391
maps:get(skf_after, Map),
351392
maps:get(skf_removed, Map),
352393
maps:get(skf_added, Map)
353-
]),
354-
lager:info("| ~4w | ~7w | ~7w | ~7w | ~7w |", [
355-
addr,
356-
maps:get(devaddr_before, Map),
357-
maps:get(devaddr_after, Map),
358-
maps:get(devaddr_removed, Map),
359-
maps:get(devaddr_added, Map)
360-
]),
361-
ok;
394+
]);
362395
{error, _R} ->
363-
lager:info("ERROR ~p", [_R]),
364-
error
396+
lager:error("error ~p", [_R])
365397
catch
366398
_E:_R ->
367-
lager:info("CRASHED ~p", [_R]),
368-
error
369-
end.
399+
lager:critical("crashed ~p", [_R])
400+
end,
401+
End = erlang:system_time(millisecond),
402+
lager:info("took ~pms", [End - Start]),
403+
routes_refresh(Total, RouteIDs).
370404

371405
config_route_refresh(["config", "route", "refresh", RouteID], [], _Flags) ->
372406
case hpr_route_stream_worker:refresh_route(RouteID, 3) of

src/grpc/iot_config/hpr_devaddr_range_storage.erl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
init_ets/0,
55
checkpoint/0,
66

7+
foldl/2,
78
lookup/1,
89
insert/1,
910
delete/1,
@@ -38,6 +39,10 @@ checkpoint() ->
3839
ok = dets:from_ets(?DETS, ?ETS)
3940
end).
4041

42+
-spec foldl(Fun :: function(), Acc :: any()) -> any().
43+
foldl(Fun, Acc) ->
44+
ets:foldl(Fun, Acc, ?ETS).
45+
4146
-spec lookup(DevAddr :: non_neg_integer()) -> [hpr_route_ets:route()].
4247
lookup(DevAddr) ->
4348
MS = [

src/grpc/iot_config/hpr_route_storage.erl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
init_ets/0,
55
checkpoint/0,
66

7+
foldl/2,
78
insert/1, insert/2, insert/3,
89
delete/1,
910
lookup/1,
@@ -60,6 +61,10 @@ lookup(ID) ->
6061
{error, not_found}
6162
end.
6263

64+
-spec foldl(Fun :: function(), Acc :: any()) -> any().
65+
foldl(Fun, Acc) ->
66+
ets:foldl(Fun, Acc, ?ETS).
67+
6368
-spec insert(Route :: hpr_route:route()) -> ok.
6469
insert(Route) ->
6570
RouteID = hpr_route:id(Route),

src/metrics/hpr_metrics.erl

Lines changed: 41 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,6 @@ handle_info(?METRICS_TICK, State) ->
186186
fun record_grpc_connections/0,
187187
fun record_routes/0,
188188
fun record_eui_pairs/0,
189-
fun record_skfs/0,
190-
fun record_weird_routes/0,
191189
fun record_ets/0,
192190
fun record_queues/0,
193191
fun record_devices/0
@@ -246,12 +244,47 @@ declare_metrics() ->
246244

247245
-spec record_routes() -> ok.
248246
record_routes() ->
249-
case ets:info(hpr_routes_ets, size) of
250-
undefined ->
251-
_ = prometheus_gauge:set(?METRICS_ROUTES_GAUGE, [], 0);
252-
N ->
253-
_ = prometheus_gauge:set(?METRICS_ROUTES_GAUGE, [], N)
254-
end,
247+
RouteIDsWithDevAddr =
248+
hpr_devaddr_range_storage:foldl(
249+
fun({_, RouteID}, Acc) ->
250+
sets:add_element(RouteID, Acc)
251+
end,
252+
sets:new()
253+
),
254+
{RoutesCount, SKFsCount, BrokenMap} = hpr_route_storage:foldl(
255+
fun(RouteETS, {RoutesCount, SKFsCount, BrokenMap}) ->
256+
SKFCount =
257+
case ets:info(hpr_route_ets:skf_ets(RouteETS), size) of
258+
undefined -> 0;
259+
N -> N
260+
end,
261+
Route = hpr_route_ets:route(RouteETS),
262+
RouteID = hpr_route:id(Route),
263+
OUI = hpr_route:oui(Route),
264+
NewBrokenMap =
265+
case SKFCount > 0 andalso not sets:is_element(RouteID, RouteIDsWithDevAddr) of
266+
true ->
267+
lager:warning(
268+
[{route_id, RouteID}, {oui, OUI}],
269+
"BROKEN_ROUTES route has no devaddr ranges but has (~p) skfs",
270+
[SKFCount]
271+
),
272+
maps:update_with(OUI, fun(Count) -> Count + 1 end, 1, BrokenMap);
273+
false ->
274+
BrokenMap
275+
end,
276+
{RoutesCount + 1, SKFsCount + SKFCount, NewBrokenMap}
277+
end,
278+
{0, 0, #{}}
279+
),
280+
_ = prometheus_gauge:set(?METRICS_ROUTES_GAUGE, [], RoutesCount),
281+
_ = prometheus_gauge:set(?METRICS_SKFS_GAUGE, [], SKFsCount),
282+
maps:foreach(
283+
fun(OUI, Count) ->
284+
_ = prometheus_gauge:set(?METRICS_BROKEN_ROUTES_GAUGE, [OUI], Count)
285+
end,
286+
BrokenMap
287+
),
255288
ok.
256289

257290
-spec record_eui_pairs() -> ok.
@@ -264,51 +297,6 @@ record_eui_pairs() ->
264297
end,
265298
ok.
266299

267-
-spec record_skfs() -> ok.
268-
record_skfs() ->
269-
Count = lists:foldl(
270-
fun(RouteETS, Acc) ->
271-
case ets:info(hpr_route_ets:skf_ets(RouteETS), size) of
272-
undefined -> Acc;
273-
N -> N + Acc
274-
end
275-
end,
276-
0,
277-
ets:tab2list(hpr_routes_ets)
278-
),
279-
_ = prometheus_gauge:set(?METRICS_SKFS_GAUGE, [], Count),
280-
ok.
281-
282-
-spec record_weird_routes() -> ok.
283-
record_weird_routes() ->
284-
Count = lists:foldl(
285-
fun(RouteETS, Acc) ->
286-
Route = hpr_route_ets:route(RouteETS),
287-
RouteID = hpr_route:id(Route),
288-
SKFCount =
289-
case ets:info(hpr_route_ets:skf_ets(RouteETS), size) of
290-
undefined -> 0;
291-
N -> N
292-
end,
293-
DevAddrRangesCount = hpr_devaddr_range_storage:count_for_route(RouteID),
294-
case SKFCount > 0 andalso DevAddrRangesCount == 0 of
295-
true ->
296-
lager:critical(
297-
[{route_id, RouteID}, {oui, hpr_route:oui(Route)}],
298-
"route has no devaddr ranges but has (~p) skfs",
299-
[SKFCount]
300-
),
301-
Acc + 1;
302-
false ->
303-
Acc
304-
end
305-
end,
306-
0,
307-
ets:tab2list(hpr_routes_ets)
308-
),
309-
_ = prometheus_gauge:set(?METRICS_WEIRD_ROUTES_GAUGE, [], Count),
310-
ok.
311-
312300
-spec record_grpc_connections() -> ok.
313301
record_grpc_connections() ->
314302
Opts = application:get_env(grpcbox, listen_opts, #{}),

0 commit comments

Comments
 (0)