IndustrialDeviceController/Software/HighLevelApp/iot/diag.c (641 lines of code) (raw):
/* Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License. */
#include <math.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <inttypes.h>
#include <applibs/applications.h>
#include <applibs/log.h>
#include <applibs/networking.h>
#include <applibs/storage.h>
#include <applibs/powermanagement.h>
#include <init/globals.h>
#include <init/adapter.h>
#include <iot/diag.h>
#include <utils/event_loop_timer.h>
#include <iot/iot.h>
#include <utils/llog.h>
#include <utils/network.h>
#include <utils/timer.h>
#include <utils/utils.h>
#include <utils/led.h>
#include <safeclib/safe_lib.h>
#include <frozen/frozen.h>
//#include "watchdog.h"
#define MAX_EVENT_IN_MEMORY 500
#define MAX_MAC_ADDRESS_SIZE 20
const char EVENT_FILE_MAGIC[8] = {'D', 'I', 'A', 'G', ' ', 'V', '0', '4'};
typedef struct event_t event_t;
struct event_t {
struct timespec ts;
uint16_t code;
uint16_t repeat;
};
typedef struct diag_value_t diag_value_t;
struct diag_value_t {
char *key;
double value;
diag_value_t *next;
};
typedef struct event_file_hdr_t event_file_hdr_t;
struct event_file_hdr_t {
char magic[8];
int32_t size;
};
typedef struct diag_t diag_t;
struct diag_t {
pthread_mutex_t lock;
diag_value_t *values;
event_loop_timer_t *heartbeat_timer;
event_loop_timer_t *report_events_timer;
event_loop_timer_t *report_twins_timer;
event_loop_timer_t *report_telemetry_timer;
event_loop_timer_t *report_log_timer;
event_loop_timer_t *led_update_timer;
struct timespec ts_app_start;
struct timespec ts_last_d2c;
event_t events[MAX_EVENT_IN_MEMORY];
int num_events;
char *reported_device_twin;
EventLoop *eloop;
};
static diag_t s_diag;
// diag telemetry
static int printf_diag_points(struct json_out *out, va_list *ap)
{
diag_value_t *values = va_arg(*ap, struct diag_value_t *);
int len = json_printf(out, "[");
for (diag_value_t *point = values; point; point = point->next) {
if (point != values) {
len += json_printf(out, ",");
}
len += json_printf(out, "[%Q,\"%.0f\"]", point->key, point->value);
}
len += json_printf(out, "]");
return len;
}
static char *build_diag_telemetry_message(void)
{
char *iot_message = json_asprintf("{timestamp:%Q,name:%Q,location:%Q,point:%M}",
timespec2str(now()),
adapter_get_name(),
adapter_get_location(),
printf_diag_points, s_diag.values);
return iot_message;
}
static void diag_telemetry_delivered(bool delivered, void *context)
{
if (delivered) {
clock_gettime(CLOCK_BOOTTIME, &s_diag.ts_last_d2c);
}
}
static void diag_report_telemetry(void)
{
char *iot_message = build_diag_telemetry_message();
LOGI("REPORT-TELEMETRY");
if (iot_send_message_async(iot_message, IOT_MESSAGE_TYPE_DIAG_TELEMETRY, diag_telemetry_delivered, NULL) != 0) {
LOGE("Failed to report telemetry");
}
FREE(iot_message);
}
static void diag_report_telemetry_cb(void *context)
{
if (network_is_connected() && iot_is_connected()) {
diag_report_telemetry();
}
}
static diag_value_t* find_diag_value(const char* key)
{
for (diag_value_t *p = s_diag.values; p; p = p->next) {
if (strcmp(p->key, key) == 0) {
return p;
}
}
return NULL;
}
// diag events
//1/1/2010, any time stamp before this been regraded as invalid RTC for our solution
#define RESONABLE_START_TIME 1262304000
static int printf_events(struct json_out *out, va_list *ap)
{
event_t *events = va_arg(*ap, struct event_t *);
int nevents = va_arg(*ap, int);
int len = 0;
for (int i = 0; i < nevents; i++) {
if (len > 0) {
len += json_printf(out, ",");
}
if (events[i].ts.tv_sec > RESONABLE_START_TIME) {
int64_t epoch_ms = SPEC2MS(events[i].ts);
len += json_printf(out, "[%llu, %d, %d]", epoch_ms, events[i].code & 0x7FFF, events[i].repeat);
} else {
// The time is get before time sync, it's not RTC, but power on seconds + epoch of 1/1/20000
// ignore it for now unless we figure out a way to fix it
fprintf(stderr, "skip unknown event\n");
}
}
return len;
}
static void remove_events_upto(struct timespec *ts)
{
int nremove = 0;
for (int i = 0; i < s_diag.num_events; i++) {
if (timespec_compare(&s_diag.events[i].ts, ts) > 0) break;
nremove++;
}
if (nremove > 0) {
memmove(s_diag.events, s_diag.events + nremove, (s_diag.num_events - nremove) * sizeof(event_t));
s_diag.num_events -= nremove;
}
}
static void event_message_delivered(bool delivered, void *context)
{
struct timespec *last_event_ts = (struct timespec*)context;
if (delivered) {
LOGD("Events reported");
remove_events_upto(last_event_ts);
} else {
LOGD("Diag events report failed");
}
FREE(last_event_ts);
}
static int save_events_to_file(void)
{
if (s_diag.num_events <= 0) return 0;
event_file_hdr_t hdr;
memcpy_s(hdr.magic, sizeof(EVENT_FILE_MAGIC), EVENT_FILE_MAGIC, sizeof(EVENT_FILE_MAGIC));
hdr.size = s_diag.num_events * sizeof(event_t);
int fd = Storage_OpenMutableFile();
if (fd < 0) {
LOGE("can't open mutable file");
return -1;
}
lseek(fd, EVENT_FILE_OFFSET, SEEK_SET);
write(fd, &hdr, sizeof(event_file_hdr_t));
write(fd, s_diag.events, hdr.size);
s_diag.num_events = 0;
fsync(fd);
close(fd);
return 0;
}
static void log_event_internal(event_code_t code, bool flush)
{
// avoid reentry
if (pthread_mutex_trylock(&s_diag.lock) != 0) return;
if ((s_diag.num_events > 0) && (code == s_diag.events[s_diag.num_events - 1].code)) {
s_diag.events[s_diag.num_events - 1].repeat++;
clock_gettime(CLOCK_REALTIME, &s_diag.events[s_diag.num_events - 1].ts);
LOGD("log repeat event: [%d %d]", code, s_diag.events[s_diag.num_events - 1].repeat);
} else {
if (s_diag.num_events < MAX_EVENT_IN_MEMORY) {
s_diag.events[s_diag.num_events].code = code;
s_diag.events[s_diag.num_events].repeat = 1;
clock_gettime(CLOCK_REALTIME, &s_diag.events[s_diag.num_events].ts);
s_diag.num_events++;
LOGD("log new event: [%d %d]", code, s_diag.events[s_diag.num_events - 1].repeat);
} else {
LOGD("event buffer full! ignore new event");
}
}
if (flush) {
save_events_to_file();
}
pthread_mutex_unlock(&s_diag.lock);
}
static void diag_report_events(void)
{
if (s_diag.num_events <= 0) return;
char *iot_message = json_asprintf("[%M]", printf_events, s_diag.events, s_diag.num_events);
LOGI("DIAG EVENTS: %s", iot_message);
struct timespec *ts = MALLOC(sizeof(struct timespec));
*ts = s_diag.events[s_diag.num_events-1].ts;
if (iot_send_message_async(iot_message,
IOT_MESSAGE_TYPE_DIAG_EVENTS, event_message_delivered, ts) == 0) {
// a bit tracky here, flip MSB of event to we won't try to increase repeat number for event already inflight
s_diag.events[s_diag.num_events-1].code |= 0x8000;
} else {
FREE(ts);
}
FREE(iot_message);
}
static void diag_report_events_cb(void *context)
{
if (network_is_connected() && iot_is_connected()) {
diag_report_events();
}
}
static void clear_event_file(int fd)
{
event_file_hdr_t hdr;
memcpy_s(hdr.magic, sizeof(EVENT_FILE_MAGIC), EVENT_FILE_MAGIC, sizeof(EVENT_FILE_MAGIC));
hdr.size = 0;
lseek(fd, EVENT_FILE_OFFSET, SEEK_SET);
write(fd, &hdr, sizeof(event_file_hdr_t));
fdatasync(fd);
}
// return number of event loaded from file
static int load_event_from_file(int fd)
{
event_file_hdr_t hdr;
if (lseek(fd, EVENT_FILE_OFFSET, SEEK_SET) != EVENT_FILE_OFFSET) {
LOGW("event file not exist");
return 0;
}
if (read(fd, &hdr, sizeof(event_file_hdr_t)) != sizeof(event_file_hdr_t)) {
LOGW("Failed to read event file hdr");
return 0;
}
if (memcmp(hdr.magic, EVENT_FILE_MAGIC, sizeof(EVENT_FILE_MAGIC)) != 0) {
LOGW("event file magic mismatch");
return 0;
}
if ((hdr.size < 0) || (hdr.size > EVENT_FILE_SIZE) || (hdr.size % sizeof(event_t) != 0)) {
LOGW("event file size incorrect");
return 0;
}
if (read(fd, s_diag.events, hdr.size) != hdr.size) {
LOGW("Failed to load event from file");
return 0;
}
s_diag.num_events = hdr.size/sizeof(event_t);
LOGD("loaded %d event from file", s_diag.num_events);
return s_diag.num_events;
}
static int network_status_to_event(Networking_InterfaceConnectionStatus status)
{
if (status & Networking_InterfaceConnectionStatus_ConnectedToInternet) {
return EVENT_NETWORK_INTERNET;
} else if (status & Networking_InterfaceConnectionStatus_IpAvailable) {
return EVENT_NETWORK_IP_AVAILABLE;
} else if (status & Networking_InterfaceConnectionStatus_ConnectedToNetwork) {
return EVENT_NETWORK_LOCAL;
} else if (status & Networking_InterfaceConnectionStatus_InterfaceUp) {
return EVENT_NETWORK_INTERFACE_UP;
} else {
return EVENT_NETWORK_NO_INTERFACE;
}
}
// log
static void diag_report_log_cb(void *context)
{
if (llog_remote_log_enabled() && network_is_connected() && iot_is_connected()) {
llog_upload();
}
}
// device twin
static int32_t printf_provisions(struct json_out *out, va_list *ap)
{
int32_t num = 0;
int32_t len = json_printf(out, "{");
for (ce_device_t *device = va_arg(*ap, struct ce_device_t *); device; device = device->next) {
if (num++ > 0) {
len += json_printf(out, ",");
}
len += json_printf(out, "%Q:%d", device->name, device->err);
}
len += json_printf(out, "}");
return len;
}
static char *build_device_twin_to_report(void)
{
if (!adapter_get_source_id()) return NULL;
char wifiMac[MAX_MAC_ADDRESS_SIZE];
char ethMac[MAX_MAC_ADDRESS_SIZE];
network_get_mac("wlan0", wifiMac, sizeof(wifiMac));
network_get_mac("eth0", ethMac, sizeof(ethMac));
bool wifi_connected = network_is_interface_connected("wlan0");
bool eth_connected = network_is_interface_connected("eth0");
struct timespec ts_boot = {.tv_sec = 0, .tv_nsec = 0};
int64_t boot_epoch = timespec2epoch(boottime2realtime(ts_boot));
int64_t app_start_epoch = timespec2epoch(boottime2realtime(s_diag.ts_app_start));
int64_t online_epoch = timespec2epoch(boottime2realtime(iot_last_online()));
int64_t offline_epoch = timespec2epoch(boottime2realtime(iot_last_offline()));
int64_t provisioned_epoch = timespec2epoch(boottime2realtime(adapter_last_provisioned()));
char *twin = json_asprintf("{"
"name:%Q,"
"sourceId:%Q,"
"firmwareVersion:%Q,"
"lastBoot:%"PRId64","
"lastAppStart:%"PRId64","
"lastOnline:%"PRId64","
"lastOffline:%"PRId64","
"lastProvision:%"PRId64","
"wifiMac:%Q,"
"ethMac:%Q,"
"wifiConnected:%B,"
"ethConnected:%B,"
"provision:%M"
"}",
adapter_get_name(),
adapter_get_source_id(),
app_version(),
boot_epoch,
app_start_epoch,
online_epoch,
offline_epoch,
provisioned_epoch,
wifiMac,
ethMac,
wifi_connected,
eth_connected,
printf_provisions, adapter_get_devices());
return twin;
}
static void device_twin_reported(bool delivered, void *context)
{
char *twin_to_report = (char*)context;
if (delivered) {
FREE(s_diag.reported_device_twin);
s_diag.reported_device_twin = twin_to_report;
} else {
FREE(twin_to_report);
}
}
static void diag_report_twins(void)
{
char *twin_to_report = build_device_twin_to_report();
if (!twin_to_report) return;
if (s_diag.reported_device_twin == NULL || strcmp(twin_to_report, s_diag.reported_device_twin) != 0) {
LOGI("TWIN-UPDATE: %s", twin_to_report);
if (iot_report_device_twin_async(twin_to_report, device_twin_reported, twin_to_report) != 0) {
LOGE("Failed to report device twin");
FREE(twin_to_report);
}
} else {
FREE(twin_to_report);
}
}
static void diag_report_twins_cb(void *context)
{
if (network_is_connected() && iot_is_connected()) {
diag_report_twins();
}
}
static void report_network_status_change(Networking_InterfaceConnectionStatus status)
{
static Networking_InterfaceConnectionStatus last_network_status = 0;
if (status != last_network_status) {
last_network_status = status;
diag_log_event(network_status_to_event(status));
LOGI("network status [%u] %s", status, network_get_status_str(status));
}
}
// led
static void update_network_led(Networking_InterfaceConnectionStatus status)
{
static bool network_led_on = false;
int color = LED_OFF;
bool flash = false;
if (status & Networking_InterfaceConnectionStatus_ConnectedToInternet) {
color = iot_is_connected() ? LED_GREEN : LED_YELLOW;
flash = false;
} else if (status & Networking_InterfaceConnectionStatus_IpAvailable) {
color = LED_YELLOW;
flash = true;
} else if (status & Networking_InterfaceConnectionStatus_ConnectedToNetwork) {
color = LED_RED;
flash = false;
} else if (status & Networking_InterfaceConnectionStatus_InterfaceUp) {
color = LED_RED;
flash = true;
} else {
color = LED_OFF;
flash = false;
}
if (flash && network_led_on) {
led_set_color(NETWORK_LED, LED_OFF);
network_led_on = false;
} else {
led_set_color(NETWORK_LED, color);
network_led_on = true;
}
}
static void update_app_led(void)
{
const int LED_COLOR_FOR_DRIVER_STATE[] = {
LED_RED, // DRIVER_STATE_INIT
LED_RED, // DRIVER_STATE_OPENED
LED_YELLOW, // DRIVER_STATE_PARTIAL
LED_GREEN // DRIVER_STATE_NORMAL
};
static bool app_led_on = false;
if (app_led_on) {
led_set_color(APP_LED, LED_OFF);
app_led_on = false;
} else {
uint32_t state = adapter_get_driver_state();
led_set_color(APP_LED, LED_COLOR_FOR_DRIVER_STATE[state]);
app_led_on = true;
}
}
static void diag_led_update_cb(void *context)
{
Networking_InterfaceConnectionStatus status = network_get_status();
report_network_status_change(status);
update_network_led(status);
update_app_led();
}
static void detect_offline_recover_reboot(void)
{
struct timespec ts_now;
clock_gettime(CLOCK_BOOTTIME, &ts_now);
if (ts_now.tv_sec - s_diag.ts_last_d2c.tv_sec < DIAG_OFFLINE_SECOND_TO_REBOOT) return;
diag_log_event_to_file(EVENT_RECOVER_REBOOT);
LOGI("Try recover, force system reboot in 2s...");
sleep(2);
PowerManagement_ForceSystemReboot();
}
static void diag_heartbeat_cb(void *context)
{
static bool reported_memory_usage = false;
int heartbeat = diag_log_count_value("heartbeat");
LOGI("~~~~~heartbeat~~~~~~: %d", heartbeat);
// log memory usage if the peak usermode memory appoaches to the limit 256k
size_t peak_usermode_memory = Applications_GetPeakUserModeMemoryUsageInKB();
if (peak_usermode_memory >= DIAG_PEAK_USERMODE_MEMORY_WATERMARK) {
diag_log_value("peak_usermode_memory", peak_usermode_memory);
diag_log_value("usermode_memory", Applications_GetUserModeMemoryUsageInKB());
diag_log_value("total_memory", Applications_GetTotalMemoryUsageInKB());
reported_memory_usage = true;
}
else if (reported_memory_usage) {
diag_remove_value("peak_usermode_memory");
diag_remove_value("usermode_memory");
diag_remove_value("total_memory");
reported_memory_usage = false;
}
detect_offline_recover_reboot();
}
static int init_diag_event(void)
{
s_diag.num_events = 0;
if (pthread_mutex_init(&s_diag.lock, NULL) != 0) {
LOGE("mutex init has failed");
return -1;
}
int fd = Storage_OpenMutableFile();
if (fd < 0) {
LOGE("can't open mutable file");
return -1;
}
if (load_event_from_file(fd) > 0) {
clear_event_file(fd);
}
close(fd);
return 0;
}
static void detect_system_reboot(void)
{
struct timespec ts_uptime;
clock_gettime(CLOCK_BOOTTIME, &ts_uptime);
// If the boot timestamp is less than DIAG_SYSTEM_BOOT_TIME, there may be a kernel reboot.
if (ts_uptime.tv_sec <= DIAG_SYSTEM_BOOT_TIME) {
LOGI("System Reboot!");
diag_log_event(EVENT_SYSTEM_REBOOT);
}
}
static int schedule_heartbeat(void)
{
struct timespec ts_heartbeat = MS2SPEC(DIAG_HEARTBEAT_MS);
s_diag.heartbeat_timer = event_loop_register_timer(s_diag.eloop, &ts_heartbeat, &ts_heartbeat, diag_heartbeat_cb, NULL);
return s_diag.heartbeat_timer ? 0 : -1;
}
static int schedule_report_events(void)
{
struct timespec ts_event = MS2SPEC(DIAG_EVENT_REPORT_MS);
s_diag.report_events_timer = event_loop_register_timer(s_diag.eloop, &ts_event, &ts_event, diag_report_events_cb, NULL);
return s_diag.report_events_timer ? 0 : -1;
}
static int schedule_report_twin(void)
{
struct timespec ts_twin = MS2SPEC(DIAG_TWIN_REPORT_MS);
s_diag.report_twins_timer = event_loop_register_timer(s_diag.eloop, &ts_twin, &ts_twin, diag_report_twins_cb, NULL);
return s_diag.report_twins_timer ? 0 : -1;
}
static int schedule_report_telemetry(void)
{
struct timespec ts_telemetry = MS2SPEC(DIAG_TELEMETRY_REPORT_MS);
s_diag.report_telemetry_timer = event_loop_register_timer(s_diag.eloop, &ts_telemetry, &ts_telemetry, diag_report_telemetry_cb, NULL);
return s_diag.report_telemetry_timer ? 0 : -1;
}
static int schedule_report_log(void)
{
struct timespec ts_log = MS2SPEC(DIAG_LOG_REPORT_MS);
s_diag.report_log_timer = event_loop_register_timer(s_diag.eloop, &ts_log, &ts_log, diag_report_log_cb, NULL);
return s_diag.report_log_timer ? 0 : -1;
}
static int schedule_update_led(void)
{
struct timespec ts_led = MS2SPEC(DIAG_LED_UPDATE_MS);
s_diag.led_update_timer = event_loop_register_timer(s_diag.eloop, &ts_led, &ts_led, diag_led_update_cb, NULL);
return s_diag.led_update_timer ? 0 : -1;
}
static void free_diag_values(void)
{
while (s_diag.values) {
diag_value_t *p = s_diag.values;
s_diag.values = p->next;
FREE(p->key);
FREE(p);
}
}
// ---------------------------- public interface ------------------------------
int diag_init(EventLoop *eloop)
{
LOGI("diag init");
memset(&s_diag, 0, sizeof(s_diag));
s_diag.eloop = eloop;
clock_gettime(CLOCK_BOOTTIME, &s_diag.ts_app_start);
clock_gettime(CLOCK_BOOTTIME, &s_diag.ts_last_d2c);
// need to be initialized before try to log any event
if (init_diag_event() < 0) {
LOGE("failed to init diag event");
return -1;
}
detect_system_reboot();
diag_log_event(EVENT_RESTART);
if (schedule_heartbeat() != 0) {
LOGE("failed to schedule heart beat");
return -1;
}
if (schedule_report_events() != 0) {
LOGE("failed to schedule report events");
return -1;
}
if (schedule_report_twin() != 0) {
LOGE("failed to schedule report twin");
return -1;
}
if (schedule_report_telemetry() != 0) {
LOGE("failed to schedule report telemetry");
return -1;
}
if (schedule_report_log() != 0) {
LOGE("failed to schedule report log");
return -1;
}
if (schedule_update_led() != 0) {
LOGE("failed to schedule update led");
return -1;
}
return 0;
}
void diag_deinit()
{
free_diag_values();
event_loop_unregister_timer(s_diag.eloop, s_diag.heartbeat_timer);
event_loop_unregister_timer(s_diag.eloop, s_diag.report_events_timer);
event_loop_unregister_timer(s_diag.eloop, s_diag.report_twins_timer);
event_loop_unregister_timer(s_diag.eloop, s_diag.report_telemetry_timer);
event_loop_unregister_timer(s_diag.eloop, s_diag.report_log_timer);
event_loop_unregister_timer(s_diag.eloop, s_diag.led_update_timer);
pthread_mutex_destroy(&s_diag.lock);
FREE(s_diag.reported_device_twin);
}
double diag_get_value(const char *key)
{
diag_value_t *p = find_diag_value(key);
return p ? p->value : NAN;
}
void diag_remove_value(const char* key)
{
diag_value_t* prev = NULL;
diag_value_t* cur = s_diag.values;
while (cur) {
if (strcmp(cur->key, key) == 0) {
break;
}
prev = cur;
cur = cur->next;
}
if (cur) {
if (prev) {
prev->next = cur->next;
}
else {
s_diag.values = cur->next;
}
FREE(cur->key);
FREE(cur);
}
}
void diag_log_value(const char *key, double value)
{
diag_value_t *p = find_diag_value(key);
if (p) {
p->value = value;
} else {
p = (diag_value_t *)MALLOC(sizeof(diag_value_t));
p->key = STRDUP(key);
p->value = value;
p->next = s_diag.values;
s_diag.values = p;
}
}
int diag_log_count_value(const char *key)
{
double count = diag_get_value(key);
count = isnan(count) ? 1 : count + 1;
diag_log_value(key, count);
return count;
}
int diag_get_count_value(const char *key)
{
double count = diag_get_value(key);
return isnan(count) ? 0 : (int)count;
}
void diag_log_event(event_code_t code)
{
log_event_internal(code, false);
}
void diag_log_event_to_file(event_code_t code)
{
log_event_internal(code, true);
}