diff --git a/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 b/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 index 97de4f3604..0ca31669b8 100644 --- a/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 +++ b/dockers/docker-dhcp-relay/docker-dhcp-relay.supervisord.conf.j2 @@ -136,6 +136,11 @@ command=/usr/sbin/dhcpmon -id {{ vlan_name }} {% for (name, prefix) in PORTCHANNEL_INTERFACE|pfx_filter %} {% if prefix | ipv4 %} -iu {{ name }}{% endif -%} {% endfor %} +{% if MGMT_INTERFACE %} +{% for (name, prefix) in MGMT_INTERFACE|pfx_filter %} +{% if prefix | ipv4 %} -im {{ name }}{% endif %} +{% endfor %} +{% endif %} priority=4 autostart=false diff --git a/src/dhcpmon/src/dhcp_device.c b/src/dhcpmon/src/dhcp_device.c index 554a1dcf0a..84ec9be9b6 100644 --- a/src/dhcpmon/src/dhcp_device.c +++ b/src/dhcpmon/src/dhcp_device.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,9 @@ #include "dhcp_device.h" +/** Counter print width */ +#define DHCP_COUNTER_WIDTH 10 + /** Start of Ether header of a captured frame */ #define ETHER_START_OFFSET 0 /** Start of IP header of a captured frame */ @@ -36,21 +40,6 @@ /** Offset of DHCP GIADDR */ #define DHCP_GIADDR_OFFSET 24 -/** - * DHCP message types - **/ -typedef enum -{ - DHCP_MESSAGE_TYPE_DISCOVER = 1, - DHCP_MESSAGE_TYPE_OFFER = 2, - DHCP_MESSAGE_TYPE_REQUEST = 3, - DHCP_MESSAGE_TYPE_DECLINE = 4, - DHCP_MESSAGE_TYPE_ACK = 5, - DHCP_MESSAGE_TYPE_NAK = 6, - DHCP_MESSAGE_TYPE_RELEASE = 7, - DHCP_MESSAGE_TYPE_INFORM = 8 -} dhcp_message_type; - #define OP_LDHA (BPF_LD | BPF_H | BPF_ABS) /** bpf ldh Abs */ #define OP_LDHI (BPF_LD | BPF_H | BPF_IND) /** bpf ldh Ind */ #define OP_LDB (BPF_LD | BPF_B | BPF_ABS) /** bpf ldb Abs*/ @@ -95,17 +84,21 @@ static struct sock_fprog dhcp_sock_bfp = { .len = sizeof(dhcp_bpf_code) / sizeof(*dhcp_bpf_code), .filter = dhcp_bpf_code }; -/** global aggregate counter for DHCP interfaces */ -static dhcp_device_counters_t glob_counters[DHCP_DIR_COUNT] = { - [DHCP_RX] = {.discover = 0, .offer = 0, .request = 0, .ack = 0}, - [DHCP_TX] = {.discover = 0, .offer = 0, .request = 0, .ack = 0}, +/** Aggregate device of DHCP interfaces. It contains aggregate counters from + all interfaces + */ +static dhcp_device_context_t aggregate_dev = {0}; + +/** Monitored DHCP message type */ +static dhcp_message_type_t monitored_msgs[] = { + DHCP_MESSAGE_TYPE_DISCOVER, + DHCP_MESSAGE_TYPE_OFFER, + DHCP_MESSAGE_TYPE_REQUEST, + DHCP_MESSAGE_TYPE_ACK }; -/** global aggregate counter snapshot for DHCP interfaces */ -static dhcp_device_counters_t glob_counters_snapshot[DHCP_DIR_COUNT] = { - [DHCP_RX] = {.discover = 0, .offer = 0, .request = 0, .ack = 0}, - [DHCP_TX] = {.discover = 0, .offer = 0, .request = 0, .ack = 0}, -}; +/** Number of monitored DHCP message type */ +static uint8_t monitored_msg_sz = sizeof(monitored_msgs) / sizeof(*monitored_msgs); /** * @code handle_dhcp_option_53(context, dhcp_option, dir, iphdr, dhcphdr); @@ -129,42 +122,29 @@ static void handle_dhcp_option_53(dhcp_device_context_t *context, in_addr_t giaddr; switch (dhcp_option[2]) { + // DHCP messages send by client case DHCP_MESSAGE_TYPE_DISCOVER: - giaddr = ntohl(dhcphdr[DHCP_GIADDR_OFFSET] << 24 | dhcphdr[DHCP_GIADDR_OFFSET + 1] << 16 | - dhcphdr[DHCP_GIADDR_OFFSET + 2] << 8 | dhcphdr[DHCP_GIADDR_OFFSET + 3]); - context->counters[dir].discover++; - if ((context->vlan_ip == giaddr && context->is_uplink && dir == DHCP_TX) || - (!context->is_uplink && dir == DHCP_RX && iphdr->ip_dst.s_addr == INADDR_BROADCAST)) { - glob_counters[dir].discover++; - } - break; - case DHCP_MESSAGE_TYPE_OFFER: - context->counters[dir].offer++; - if ((context->vlan_ip == iphdr->ip_dst.s_addr && context->is_uplink && dir == DHCP_RX) || - (!context->is_uplink && dir == DHCP_TX)) { - glob_counters[dir].offer++; - } - break; case DHCP_MESSAGE_TYPE_REQUEST: - giaddr = ntohl(dhcphdr[DHCP_GIADDR_OFFSET] << 24 | dhcphdr[DHCP_GIADDR_OFFSET + 1] << 16 | - dhcphdr[DHCP_GIADDR_OFFSET + 2] << 8 | dhcphdr[DHCP_GIADDR_OFFSET + 3]); - context->counters[dir].request++; - if ((context->vlan_ip == giaddr && context->is_uplink && dir == DHCP_TX) || - (!context->is_uplink && dir == DHCP_RX && iphdr->ip_dst.s_addr == INADDR_BROADCAST)) { - glob_counters[dir].request++; - } - break; - case DHCP_MESSAGE_TYPE_ACK: - context->counters[dir].ack++; - if ((context->vlan_ip == iphdr->ip_dst.s_addr && context->is_uplink && dir == DHCP_RX) || - (!context->is_uplink && dir == DHCP_TX)) { - glob_counters[dir].ack++; - } - break; case DHCP_MESSAGE_TYPE_DECLINE: - case DHCP_MESSAGE_TYPE_NAK: case DHCP_MESSAGE_TYPE_RELEASE: case DHCP_MESSAGE_TYPE_INFORM: + giaddr = ntohl(dhcphdr[DHCP_GIADDR_OFFSET] << 24 | dhcphdr[DHCP_GIADDR_OFFSET + 1] << 16 | + dhcphdr[DHCP_GIADDR_OFFSET + 2] << 8 | dhcphdr[DHCP_GIADDR_OFFSET + 3]); + if ((context->vlan_ip == giaddr && context->is_uplink && dir == DHCP_TX) || + (!context->is_uplink && dir == DHCP_RX && iphdr->ip_dst.s_addr == INADDR_BROADCAST)) { + context->counters[DHCP_COUNTERS_CURRENT][dir][dhcp_option[2]]++; + aggregate_dev.counters[DHCP_COUNTERS_CURRENT][dir][dhcp_option[2]]++; + } + break; + // DHCP messages send by server + case DHCP_MESSAGE_TYPE_OFFER: + case DHCP_MESSAGE_TYPE_ACK: + case DHCP_MESSAGE_TYPE_NAK: + if ((context->vlan_ip == iphdr->ip_dst.s_addr && context->is_uplink && dir == DHCP_RX) || + (!context->is_uplink && dir == DHCP_TX)) { + context->counters[DHCP_COUNTERS_CURRENT][dir][dhcp_option[2]]++; + aggregate_dev.counters[DHCP_COUNTERS_CURRENT][dir][dhcp_option[2]]++; + } break; default: syslog(LOG_WARNING, "handle_dhcp_option_53(%s): Unknown DHCP option 53 type %d", context->intf, dhcp_option[2]); @@ -242,62 +222,166 @@ static void read_callback(int fd, short event, void *arg) } /** - * @code dhcp_device_validate(counters, counters_snapshot); + * @code dhcp_device_is_dhcp_inactive(counters); * - * @brief validate current interface counters by comparing aggregate counter with snapshot counters. + * @brief Check if there were no DHCP activity * - * @param counters recent interface counter - * @param counters_snapshot snapshot counters + * @param counters current/snapshot counter * - * @return DHCP_MON_STATUS_HEALTHY, DHCP_MON_STATUS_UNHEALTHY, or DHCP_MON_STATUS_INDETERMINATE + * @return true if there were no DHCP activity, false otherwise */ -static dhcp_mon_status_t dhcp_device_validate(dhcp_device_counters_t *counters, - dhcp_device_counters_t *counters_snapshot) +static bool dhcp_device_is_dhcp_inactive(uint64_t counters[][DHCP_DIR_COUNT][DHCP_MESSAGE_TYPE_COUNT]) { - dhcp_mon_status_t rv = DHCP_MON_STATUS_HEALTHY; + uint64_t *rx_counters = counters[DHCP_COUNTERS_CURRENT][DHCP_RX]; + uint64_t *rx_counter_snapshot = counters[DHCP_COUNTERS_SNAPSHOT][DHCP_RX]; - if ((counters[DHCP_RX].discover == counters_snapshot[DHCP_RX].discover) && - (counters[DHCP_RX].offer == counters_snapshot[DHCP_RX].offer ) && - (counters[DHCP_RX].request == counters_snapshot[DHCP_RX].request ) && - (counters[DHCP_RX].ack == counters_snapshot[DHCP_RX].ack ) ) { - rv = DHCP_MON_STATUS_INDETERMINATE; - } else { - // if we have rx DORA then we should have corresponding tx DORA (DORA being relayed) - if (((counters[DHCP_RX].discover > counters_snapshot[DHCP_RX].discover) && - (counters[DHCP_TX].discover <= counters_snapshot[DHCP_TX].discover) ) || - ((counters[DHCP_RX].offer > counters_snapshot[DHCP_RX].offer ) && - (counters[DHCP_TX].offer <= counters_snapshot[DHCP_TX].offer ) ) || - ((counters[DHCP_RX].request > counters_snapshot[DHCP_RX].request ) && - (counters[DHCP_TX].request <= counters_snapshot[DHCP_TX].request ) ) || - ((counters[DHCP_RX].ack > counters_snapshot[DHCP_RX].ack ) && - (counters[DHCP_TX].ack <= counters_snapshot[DHCP_TX].ack ) ) ) { - rv = DHCP_MON_STATUS_UNHEALTHY; - } + bool rv = true; + for (uint8_t i = 0; (i < monitored_msg_sz) && rv; i++) { + rv = rx_counters[monitored_msgs[i]] == rx_counter_snapshot[monitored_msgs[i]]; } return rv; } /** - * @code dhcp_print_counters(counters); + * @code dhcp_device_is_dhcp_msg_unhealthy(type, counters); + * + * @brief Check if DHCP relay is functioning properly for message of type 'type'. + * For every rx of message 'type', there should be increment of the same message type. + * + * @param type DHCP message type + * @param counters current/snapshot counter + * + * @return true if DHCP message 'type' is transmitted,false otherwise + */ +static bool dhcp_device_is_dhcp_msg_unhealthy(dhcp_message_type_t type, + uint64_t counters[][DHCP_DIR_COUNT][DHCP_MESSAGE_TYPE_COUNT]) +{ + // check if DHCP message 'type' is being relayed + return ((counters[DHCP_COUNTERS_CURRENT][DHCP_RX][type] > counters[DHCP_COUNTERS_SNAPSHOT][DHCP_RX][type]) && + (counters[DHCP_COUNTERS_CURRENT][DHCP_TX][type] <= counters[DHCP_COUNTERS_SNAPSHOT][DHCP_TX][type]) ); +} + +/** + * @code dhcp_device_check_positive_health(counters, counters_snapshot); + * + * @brief Check if DHCP relay is functioning properly for monitored messages (Discover, Offer, Request, ACK.) + * For every rx of monitored messages, there should be increment of the same message type. + * + * @param counters current/snapshot counter + * + * @return DHCP_MON_STATUS_HEALTHY, DHCP_MON_STATUS_UNHEALTHY, or DHCP_MON_STATUS_INDETERMINATE + */ +static dhcp_mon_status_t dhcp_device_check_positive_health(uint64_t counters[][DHCP_DIR_COUNT][DHCP_MESSAGE_TYPE_COUNT]) +{ + dhcp_mon_status_t rv = DHCP_MON_STATUS_HEALTHY; + + bool is_dhcp_unhealthy = false; + for (uint8_t i = 0; (i < monitored_msg_sz) && !is_dhcp_unhealthy; i++) { + is_dhcp_unhealthy = dhcp_device_is_dhcp_msg_unhealthy(monitored_msgs[i], counters); + } + + // if we have rx DORA then we should have corresponding tx DORA (DORA being relayed) + if (is_dhcp_unhealthy) { + rv = DHCP_MON_STATUS_UNHEALTHY; + } + + return rv; +} + +/** + * @code dhcp_device_check_negative_health(counters); + * + * @brief Check that DHCP relayed messages are not being transmitted out of this interface/dev + * using its counters. The interface is negatively healthy if there are not DHCP message + * travelling through it. + * + * @param counters recent interface counter + * @param counters_snapshot snapshot counters + * + * @return DHCP_MON_STATUS_HEALTHY, DHCP_MON_STATUS_UNHEALTHY, or DHCP_MON_STATUS_INDETERMINATE + */ +static dhcp_mon_status_t dhcp_device_check_negative_health(uint64_t counters[][DHCP_DIR_COUNT][DHCP_MESSAGE_TYPE_COUNT]) +{ + dhcp_mon_status_t rv = DHCP_MON_STATUS_HEALTHY; + + uint64_t *tx_counters = counters[DHCP_COUNTERS_CURRENT][DHCP_TX]; + uint64_t *tx_counter_snapshot = counters[DHCP_COUNTERS_SNAPSHOT][DHCP_TX]; + + bool is_dhcp_unhealthy = false; + for (uint8_t i = 0; (i < monitored_msg_sz) && !is_dhcp_unhealthy; i++) { + is_dhcp_unhealthy = tx_counters[monitored_msgs[i]] > tx_counter_snapshot[monitored_msgs[i]]; + } + + // for negative validation, return unhealthy if DHCP packet are being + // transmitted out of the device/interface + if (is_dhcp_unhealthy) { + rv = DHCP_MON_STATUS_UNHEALTHY; + } + + return rv; +} + +/** + * @code dhcp_device_check_health(check_type, counters, counters_snapshot); + * + * @brief Check that DHCP relay is functioning properly given a check type. Positive check + * indicates for every rx of DHCP message of type 'type', there would increment of + * the corresponding TX of the same message type. While negative check indicates the + * device should not be actively transmitting any DHCP messages. If it does, it is + * considered unhealthy. + * + * @param check_type type of health check + * @param counters current/snapshot counter + * + * @return DHCP_MON_STATUS_HEALTHY, DHCP_MON_STATUS_UNHEALTHY, or DHCP_MON_STATUS_INDETERMINATE + */ +static dhcp_mon_status_t dhcp_device_check_health(dhcp_mon_check_t check_type, + uint64_t counters[][DHCP_DIR_COUNT][DHCP_MESSAGE_TYPE_COUNT]) +{ + dhcp_mon_status_t rv = DHCP_MON_STATUS_HEALTHY; + + if (dhcp_device_is_dhcp_inactive(aggregate_dev.counters)) { + rv = DHCP_MON_STATUS_INDETERMINATE; + } else if (check_type == DHCP_MON_CHECK_POSITIVE) { + rv = dhcp_device_check_positive_health(counters); + } else if (check_type == DHCP_MON_CHECK_NEGATIVE) { + rv = dhcp_device_check_negative_health(counters); + } + + return rv; +} + +/** + * @code dhcp_print_counters(vlan_intf, counters); * * @brief prints DHCP counters to sylsog. * * @param counters interface counter */ -static void dhcp_print_counters(dhcp_device_counters_t *counters) +static void dhcp_print_counters(const char *vlan_intf, uint64_t counters[][DHCP_DIR_COUNT][DHCP_MESSAGE_TYPE_COUNT]) { - syslog(LOG_NOTICE, "DHCP Discover rx: %lu, tx:%lu, Offer rx: %lu, tx:%lu, Request rx: %lu, tx:%lu, ACK rx: %lu, tx:%lu\n", - counters[DHCP_RX].discover, counters[DHCP_TX].discover, - counters[DHCP_RX].offer, counters[DHCP_TX].offer, - counters[DHCP_RX].request, counters[DHCP_TX].request, - counters[DHCP_RX].ack, counters[DHCP_TX].ack); + uint64_t *rx_counters = counters[DHCP_COUNTERS_CURRENT][DHCP_RX]; + uint64_t *tx_counters = counters[DHCP_COUNTERS_CURRENT][DHCP_TX]; + + syslog(LOG_NOTICE, + "[%*s] DHCP Discover rx/tx: %*lu/%*lu, Offer rx/tx: %*lu/%*lu, " + "Request rx/tx: %*lu/%*lu, ACK rx/tx: %*lu/%*lu\n", + IF_NAMESIZE, vlan_intf, + DHCP_COUNTER_WIDTH, rx_counters[DHCP_MESSAGE_TYPE_DISCOVER], + DHCP_COUNTER_WIDTH, tx_counters[DHCP_MESSAGE_TYPE_DISCOVER], + DHCP_COUNTER_WIDTH, rx_counters[DHCP_MESSAGE_TYPE_OFFER], + DHCP_COUNTER_WIDTH, tx_counters[DHCP_MESSAGE_TYPE_OFFER], + DHCP_COUNTER_WIDTH, rx_counters[DHCP_MESSAGE_TYPE_REQUEST], + DHCP_COUNTER_WIDTH, tx_counters[DHCP_MESSAGE_TYPE_REQUEST], + DHCP_COUNTER_WIDTH, rx_counters[DHCP_MESSAGE_TYPE_ACK], + DHCP_COUNTER_WIDTH, tx_counters[DHCP_MESSAGE_TYPE_ACK]); } /** * @code init_socket(context, intf); * - * @brief initializes socket, bind it to interface and bpf prgram, and + * @brief initializes socket, bind it to interface and bpf program, and * associate with libevent base * * @param context pointer to device (interface) context @@ -403,6 +487,18 @@ int dhcp_device_get_ip(dhcp_device_context_t *context, in_addr_t *ip) return rv; } +/** + * @code dhcp_device_get_aggregate_context(); + * + * @brief Accessor method + * + * @return pointer to aggregate device (interface) context + */ +dhcp_device_context_t* dhcp_device_get_aggregate_context() +{ + return &aggregate_dev; +} + /** * @code dhcp_device_init(context, intf, is_uplink); * @@ -422,8 +518,7 @@ int dhcp_device_init(dhcp_device_context_t **context, const char *intf, uint8_t dev_context->is_uplink = is_uplink; - memset(&dev_context->counters, 0, sizeof(dev_context->counters)); - memset(&dev_context->counters_snapshot, 0, sizeof(dev_context->counters_snapshot)); + memset(dev_context->counters, 0, sizeof(dev_context->counters)); *context = dev_context; rv = 0; @@ -498,36 +593,44 @@ void dhcp_device_shutdown(dhcp_device_context_t *context) } /** - * @code dhcp_device_get_status(context); + * @code dhcp_device_get_status(check_type, context); * * @brief collects DHCP relay status info for a given interface. If context is null, it will report aggregate * status */ -dhcp_mon_status_t dhcp_device_get_status(dhcp_device_context_t *context) +dhcp_mon_status_t dhcp_device_get_status(dhcp_mon_check_t check_type, dhcp_device_context_t *context) { - dhcp_mon_status_t rv = 0; + dhcp_mon_status_t rv = DHCP_MON_STATUS_HEALTHY; if (context != NULL) { - rv = dhcp_device_validate(context->counters, context->counters_snapshot); - memcpy(context->counters_snapshot, context->counters, sizeof(context->counters_snapshot)); - } else { - rv = dhcp_device_validate(glob_counters, glob_counters_snapshot); - memcpy(glob_counters_snapshot, glob_counters, sizeof(glob_counters_snapshot)); + rv = dhcp_device_check_health(check_type, context->counters); } return rv; } /** - * @code dhcp_device_print_status(); + * @code dhcp_device_update_snapshot(context); * - * @brief prints status counters to syslog. If context is null, it will print aggregate status + * @brief Update device/interface counters snapshot + */ +void dhcp_device_update_snapshot(dhcp_device_context_t *context) +{ + if (context != NULL) { + memcpy(context->counters[DHCP_COUNTERS_SNAPSHOT], + context->counters[DHCP_COUNTERS_CURRENT], + sizeof(context->counters[DHCP_COUNTERS_SNAPSHOT])); + } +} + +/** + * @code dhcp_device_print_status(context); + * + * @brief prints status counters to syslog. */ void dhcp_device_print_status(dhcp_device_context_t *context) { if (context != NULL) { - dhcp_print_counters(context->counters); - } else { - dhcp_print_counters(glob_counters); + dhcp_print_counters(context->intf, context->counters); } } diff --git a/src/dhcpmon/src/dhcp_device.h b/src/dhcpmon/src/dhcp_device.h index bc1582d46a..9eee669101 100644 --- a/src/dhcpmon/src/dhcp_device.h +++ b/src/dhcpmon/src/dhcp_device.h @@ -17,6 +17,23 @@ #include +/** + * DHCP message types + **/ +typedef enum +{ + DHCP_MESSAGE_TYPE_DISCOVER = 1, + DHCP_MESSAGE_TYPE_OFFER = 2, + DHCP_MESSAGE_TYPE_REQUEST = 3, + DHCP_MESSAGE_TYPE_DECLINE = 4, + DHCP_MESSAGE_TYPE_ACK = 5, + DHCP_MESSAGE_TYPE_NAK = 6, + DHCP_MESSAGE_TYPE_RELEASE = 7, + DHCP_MESSAGE_TYPE_INFORM = 8, + + DHCP_MESSAGE_TYPE_COUNT +} dhcp_message_type_t; + /** packet direction */ typedef enum { @@ -26,6 +43,15 @@ typedef enum DHCP_DIR_COUNT } dhcp_packet_direction_t; +/** counters type */ +typedef enum +{ + DHCP_COUNTERS_CURRENT, /** DHCP current counters */ + DHCP_COUNTERS_SNAPSHOT, /** DHCP snapshot counters */ + + DHCP_COUNTERS_COUNT +} dhcp_counters_type_t; + /** dhcp health status */ typedef enum { @@ -34,14 +60,12 @@ typedef enum DHCP_MON_STATUS_INDETERMINATE, /** DHCP relay health could not be determined */ } dhcp_mon_status_t; -/** DHCP device (interface) health counters */ -typedef struct +/** dhcp check type */ +typedef enum { - uint64_t discover; /** DHCP discover packets */ - uint64_t offer; /** DHCP offer packets */ - uint64_t request; /** DHCP request packets */ - uint64_t ack; /** DHCP ack packets */ -} dhcp_device_counters_t; + DHCP_MON_CHECK_NEGATIVE, /** Presence of relayed DHCP packets activity is flagged as unhealthy state */ + DHCP_MON_CHECK_POSITIVE, /** Validate that received DORA packets are relayed */ +} dhcp_mon_check_t; /** DHCP device (interface) context */ typedef struct @@ -54,10 +78,8 @@ typedef struct char intf[IF_NAMESIZE]; /** device (interface) name */ uint8_t *buffer; /** buffer used to read socket data */ size_t snaplen; /** snap length or buffer size */ - dhcp_device_counters_t counters[DHCP_DIR_COUNT]; - /** current coutners of DORA packets */ - dhcp_device_counters_t counters_snapshot[DHCP_DIR_COUNT]; - /** counter snapshot */ + uint64_t counters[DHCP_COUNTERS_COUNT][DHCP_DIR_COUNT][DHCP_MESSAGE_TYPE_COUNT]; + /** current/snapshot counters of DHCP packets */ } dhcp_device_context_t; /** @@ -72,6 +94,15 @@ typedef struct */ int dhcp_device_get_ip(dhcp_device_context_t *context, in_addr_t *ip); +/** + * @code dhcp_device_get_aggregate_context(); + * + * @brief Accessor method + * + * @return pointer to aggregate device (interface) context + */ +dhcp_device_context_t* dhcp_device_get_aggregate_context(); + /** * @code dhcp_device_init(context, intf, is_uplink); * @@ -116,22 +147,34 @@ int dhcp_device_start_capture(dhcp_device_context_t *context, void dhcp_device_shutdown(dhcp_device_context_t *context); /** - * @code dhcp_device_get_status(context); + * @code dhcp_device_get_status(check_type, context); * * @brief collects DHCP relay status info for a given interface. If context is null, it will report aggregate * status * - * @param context Device (interface) context + * @param check_type Type of validation + * @param context Device (interface) context * * @return DHCP_MON_STATUS_HEALTHY, DHCP_MON_STATUS_UNHEALTHY, or DHCP_MON_STATUS_INDETERMINATE */ -dhcp_mon_status_t dhcp_device_get_status(dhcp_device_context_t *context); +dhcp_mon_status_t dhcp_device_get_status(dhcp_mon_check_t check_type, dhcp_device_context_t *context); /** - * @code dhcp_device_print_status(); + * @code dhcp_device_update_snapshot(context); + * + * @param context Device (interface) context + * + * @brief Update device/interface counters snapshot + */ +void dhcp_device_update_snapshot(dhcp_device_context_t *context); + +/** + * @code dhcp_device_print_status(context); * * @brief prints status counters to syslog. If context is null, it will print aggregate status * + * @param context Device (interface) context + * * @return none */ void dhcp_device_print_status(dhcp_device_context_t *context); diff --git a/src/dhcpmon/src/dhcp_devman.c b/src/dhcpmon/src/dhcp_devman.c index 077ed210a2..e9a47ae581 100644 --- a/src/dhcpmon/src/dhcp_devman.c +++ b/src/dhcpmon/src/dhcp_devman.c @@ -12,6 +12,9 @@ #include "dhcp_devman.h" +/** Prefix appended to Aggregation device */ +#define AGG_DEV_PREFIX "Agg-" + /** struct for interface information */ struct intf { @@ -27,22 +30,34 @@ static LIST_HEAD(intf_list, intf) intfs; static uint32_t dhcp_num_south_intf = 0; /** dhcp_num_north_intf number of north interfaces */ static uint32_t dhcp_num_north_intf = 0; +/** dhcp_num_mgmt_intf number of mgmt interfaces */ +static uint32_t dhcp_num_mgmt_intf = 0; /** On Device vlan interface IP address corresponding vlan downlink IP * This IP is used to filter Offer/Ack packet coming from DHCP server */ static in_addr_t vlan_ip = 0; -/** vlan interface name */ -static char vlan_intf[IF_NAMESIZE] = "Undefined"; +/** mgmt interface */ +static struct intf *mgmt_intf = NULL; /** * @code dhcp_devman_get_vlan_intf(); * * Accessor method */ -const char* dhcp_devman_get_vlan_intf() +dhcp_device_context_t* dhcp_devman_get_agg_dev() { - return vlan_intf; + return dhcp_device_get_aggregate_context(); +} + +/** + * @code dhcp_devman_get_mgmt_dev(); + * + * Accessor method + */ +dhcp_device_context_t* dhcp_devman_get_mgmt_dev() +{ + return mgmt_intf ? mgmt_intf->dev_context : NULL; } /** @@ -86,27 +101,42 @@ void dhcp_devman_shutdown() * * @brief adds interface to the device manager. */ -int dhcp_devman_add_intf(const char *name, uint8_t is_uplink) +int dhcp_devman_add_intf(const char *name, char intf_type) { int rv = -1; struct intf *dev = malloc(sizeof(struct intf)); if (dev != NULL) { dev->name = name; - dev->is_uplink = is_uplink; - if (is_uplink) { + dev->is_uplink = intf_type != 'd'; + + switch (intf_type) + { + case 'u': dhcp_num_north_intf++; - } else { + break; + case 'd': dhcp_num_south_intf++; assert(dhcp_num_south_intf <= 1); + break; + case 'm': + dhcp_num_mgmt_intf++; + assert(dhcp_num_mgmt_intf <= 1); + mgmt_intf = dev; + break; + default: + break; } rv = dhcp_device_init(&dev->dev_context, dev->name, dev->is_uplink); - if (rv == 0 && !is_uplink) { + if (rv == 0 && intf_type == 'd') { rv = dhcp_device_get_ip(dev->dev_context, &vlan_ip); - strncpy(vlan_intf, name, sizeof(vlan_intf) - 1); - vlan_intf[sizeof(vlan_intf) - 1] = '\0'; + dhcp_device_context_t *agg_dev = dhcp_device_get_aggregate_context(); + + strncpy(agg_dev->intf, AGG_DEV_PREFIX, sizeof(AGG_DEV_PREFIX)); + strncpy(agg_dev->intf + sizeof(AGG_DEV_PREFIX) - 1, name, sizeof(agg_dev->intf) - sizeof(AGG_DEV_PREFIX)); + agg_dev->intf[sizeof(agg_dev->intf) - 1] = '\0'; } LIST_INSERT_HEAD(&intfs, dev, entry); @@ -152,21 +182,41 @@ int dhcp_devman_start_capture(size_t snaplen, struct event_base *base) } /** - * @code dhcp_devman_get_status(); + * @code dhcp_devman_get_status(check_type, context); * * @brief collects DHCP relay status info. */ -dhcp_mon_status_t dhcp_devman_get_status() +dhcp_mon_status_t dhcp_devman_get_status(dhcp_mon_check_t check_type, dhcp_device_context_t *context) { - return dhcp_device_get_status(NULL); + return dhcp_device_get_status(check_type, context); } /** - * @code dhcp_devman_print_status(); + * @code dhcp_devman_update_snapshot(context); * - * @brief prints status counters to syslog + * @brief Update device/interface counters snapshot */ -void dhcp_devman_print_status() +void dhcp_devman_update_snapshot(dhcp_device_context_t *context) { - dhcp_device_print_status(NULL); + dhcp_device_update_snapshot(context); +} + +/** + * @code dhcp_devman_print_status(context); + * + * @brief prints status counters to syslog, if context is null, it prints status counters for all interfaces + */ +void dhcp_devman_print_status(dhcp_device_context_t *context) +{ + if (context == NULL) { + struct intf *int_ptr; + + LIST_FOREACH(int_ptr, &intfs, entry) { + dhcp_device_print_status(int_ptr->dev_context); + } + + dhcp_device_print_status(dhcp_devman_get_agg_dev()); + } else { + dhcp_device_print_status(context); + } } diff --git a/src/dhcpmon/src/dhcp_devman.h b/src/dhcpmon/src/dhcp_devman.h index 2f66fa407c..4d8a532657 100644 --- a/src/dhcpmon/src/dhcp_devman.h +++ b/src/dhcpmon/src/dhcp_devman.h @@ -36,9 +36,18 @@ void dhcp_devman_shutdown(); * * @brief Accessor method * - * @return pointer to vlan ip interface name + * @return pointer to aggregate device (interface) context */ -const char* dhcp_devman_get_vlan_intf(); +dhcp_device_context_t* dhcp_devman_get_agg_dev(); + +/** + * @code dhcp_devman_get_mgmt_intf_context(); + * + * @brief Accessor method + * + * @return pointer to mgmt interface context + */ +dhcp_device_context_t* dhcp_devman_get_mgmt_dev(); /** * @code dhcp_devman_add_intf(name, uplink); @@ -46,11 +55,13 @@ const char* dhcp_devman_get_vlan_intf(); * @brief adds interface to the device manager. * * @param name interface name - * @param is_uplink true for uplink (north) interface + * @param intf_type 'u' for uplink (north) interface + * 'd' for downlink (south) interface + * 'm' for mgmt interface * * @return 0 on success, nonzero otherwise */ -int dhcp_devman_add_intf(const char *name, uint8_t is_uplink); +int dhcp_devman_add_intf(const char *name, char intf_type); /** * @code dhcp_devman_start_capture(snaplen, base); @@ -65,21 +76,35 @@ int dhcp_devman_add_intf(const char *name, uint8_t is_uplink); int dhcp_devman_start_capture(size_t snaplen, struct event_base *base); /** - * @code dhcp_devman_get_status(); + * @code dhcp_devman_get_status(check_type, context); * * @brief collects DHCP relay status info. * + * @param check_type Type of validation + * @param context pointer to device (interface) context + * * @return DHCP_MON_STATUS_HEALTHY, DHCP_MON_STATUS_UNHEALTHY, or DHCP_MON_STATUS_INDETERMINATE */ -dhcp_mon_status_t dhcp_devman_get_status(); +dhcp_mon_status_t dhcp_devman_get_status(dhcp_mon_check_t check_type, dhcp_device_context_t *context); /** - * @code dhcp_devman_print_status(); + * @code dhcp_devman_update_snapshot(context); + * + * @param context Device (interface) context + * + * @brief Update device/interface counters snapshot + */ +void dhcp_devman_update_snapshot(dhcp_device_context_t *context); + +/** + * @code dhcp_devman_print_status(context); * * @brief prints status counters to syslog * + * @param context pointer to device (interface) context + * * @return none */ -void dhcp_devman_print_status(); +void dhcp_devman_print_status(dhcp_device_context_t *context); #endif /* DHCP_DEVMAN_H_ */ diff --git a/src/dhcpmon/src/dhcp_mon.c b/src/dhcpmon/src/dhcp_mon.c index 623bc46f52..8147b5057c 100644 --- a/src/dhcpmon/src/dhcp_mon.c +++ b/src/dhcpmon/src/dhcp_mon.c @@ -16,8 +16,17 @@ #include "dhcp_mon.h" #include "dhcp_devman.h" +/** DHCP device/interface state */ +typedef struct +{ + dhcp_mon_check_t check_type; /** check type */ + dhcp_device_context_t* (*get_context)(); /** functor to a device context accessor function */ + int count; /** count in the number of unhealthy checks */ + const char *msg; /** message to be printed if unhealthy state is determined */ +} dhcp_mon_state_t; + /** window_interval_sec monitoring window for dhcp relay health checks */ -static int window_interval_sec = 12; +static int window_interval_sec = 18; /** dhcp_unhealthy_max_count max count of consecutive unhealthy statuses before reporting to syslog */ static int dhcp_unhealthy_max_count = 10; /** libevent base struct */ @@ -28,6 +37,25 @@ static struct event *ev_timeout = NULL; static struct event *ev_sigint; /** libevent SIGTERM signal event struct */ static struct event *ev_sigterm; +/** libevent SIGUSR1 signal event struct */ +static struct event *ev_sigusr1; + +/** DHCP monitor state data for aggregate device for mgmt device */ +static dhcp_mon_state_t state_data[] = { + [0] = { + .check_type = DHCP_MON_CHECK_POSITIVE, + .get_context = dhcp_devman_get_agg_dev, + .count = 0, + .msg = "dhcpmon detected disparity in DHCP Relay behavior. Duration: %d (sec) for vlan: '%s'\n" + }, + [1] = { + .check_type = DHCP_MON_CHECK_NEGATIVE, + .get_context = dhcp_devman_get_mgmt_dev, + .count = 0, + .msg = "dhcpmon detected DHCP packets traveling through mgmt interface (please check BGP routes.)" + " Duration: %d (sec) for intf: '%s'\n" + } +}; /** * @code signal_callback(fd, event, arg); @@ -42,9 +70,47 @@ static struct event *ev_sigterm; */ static void signal_callback(evutil_socket_t fd, short event, void *arg) { - syslog(LOG_ALERT, "Received signal %s\n", strsignal(fd)); - dhcp_devman_print_status(); - dhcp_mon_stop(); + syslog(LOG_ALERT, "Received signal: '%s'\n", strsignal(fd)); + dhcp_devman_print_status(NULL); + if ((fd == SIGTERM) || (fd == SIGINT)) { + dhcp_mon_stop(); + } +} + +/** + * @code check_dhcp_relay_health(state_data); + * + * @brief check DHCP relay overall health + * + * @param state_data pointer to dhcpmon state data + * + * @return none + */ +static void check_dhcp_relay_health(dhcp_mon_state_t *state_data) +{ + dhcp_device_context_t *context = state_data->get_context(); + dhcp_mon_status_t dhcp_mon_status = dhcp_devman_get_status(state_data->check_type, context); + + switch (dhcp_mon_status) + { + case DHCP_MON_STATUS_UNHEALTHY: + if (++state_data->count > dhcp_unhealthy_max_count) { + syslog(LOG_ALERT, state_data->msg, state_data->count * window_interval_sec, context->intf); + dhcp_devman_print_status(context); + } + break; + case DHCP_MON_STATUS_HEALTHY: + state_data->count = 0; + break; + case DHCP_MON_STATUS_INDETERMINATE: + if (state_data->count) { + state_data->count++; + } + break; + default: + syslog(LOG_ERR, "DHCP Relay returned unknown status %d\n", dhcp_mon_status); + break; + } } /** @@ -60,28 +126,12 @@ static void signal_callback(evutil_socket_t fd, short event, void *arg) */ static void timeout_callback(evutil_socket_t fd, short event, void *arg) { - static int count = 0; - dhcp_mon_status_t dhcp_mon_status = dhcp_devman_get_status(); + for (uint8_t i = 0; i < sizeof(state_data) / sizeof(*state_data); i++) { + check_dhcp_relay_health(&state_data[i]); + } - switch (dhcp_mon_status) - { - case DHCP_MON_STATUS_UNHEALTHY: - if (++count > dhcp_unhealthy_max_count) { - syslog(LOG_ALERT, "dhcpmon detected disparity in DHCP Relay behavior. Failure count: %d for vlan: '%s'\n", - count, dhcp_devman_get_vlan_intf()); - dhcp_devman_print_status(); - } - break; - case DHCP_MON_STATUS_HEALTHY: - if (count > 0) { - count = 0; - } - break; - case DHCP_MON_STATUS_INDETERMINATE: - break; - default: - syslog(LOG_ERR, "DHCP Relay returned unknown status %d\n", dhcp_mon_status); - break; + for (uint8_t i = 0; i < sizeof(state_data) / sizeof(*state_data); i++) { + dhcp_devman_update_snapshot(state_data[i].get_context()); } } @@ -118,6 +168,12 @@ int dhcp_mon_init(int window_sec, int max_count) break; } + ev_sigusr1 = evsignal_new(base, SIGUSR1, signal_callback, base); + if (ev_sigusr1 == NULL) { + syslog(LOG_ERR, "Could not create SIGUSER1 libevent signal!\n"); + break; + } + ev_timeout = event_new(base, -1, EV_PERSIST, timeout_callback, base); if (ev_timeout == NULL) { syslog(LOG_ERR, "Could not create libevent timer!\n"); @@ -140,10 +196,12 @@ void dhcp_mon_shutdown() event_del(ev_timeout); event_del(ev_sigint); event_del(ev_sigterm); + event_del(ev_sigusr1); event_free(ev_timeout); event_free(ev_sigint); event_free(ev_sigterm); + event_free(ev_sigusr1); event_base_free(base); } @@ -173,6 +231,11 @@ int dhcp_mon_start(size_t snaplen) break; } + if (evsignal_add(ev_sigusr1, NULL) != 0) { + syslog(LOG_ERR, "Could not add SIGUSR1 libevent signal!\n"); + break; + } + struct timeval event_time = {.tv_sec = window_interval_sec, .tv_usec = 0}; if (evtimer_add(ev_timeout, &event_time) != 0) { syslog(LOG_ERR, "Could not add event timer to libevent!\n"); diff --git a/src/dhcpmon/src/main.c b/src/dhcpmon/src/main.c index 9d155393a6..bb8c45867f 100644 --- a/src/dhcpmon/src/main.c +++ b/src/dhcpmon/src/main.c @@ -24,7 +24,7 @@ static const size_t dhcpmon_default_snaplen = 65535; /** dhcpmon_default_health_check_window: default value for a time window, during which DHCP DORA packet counts are being * collected */ -static const uint32_t dhcpmon_default_health_check_window = 12; +static const uint32_t dhcpmon_default_health_check_window = 18; /** dhcpmon_default_unhealthy_max_count: default max consecutive unhealthy status reported before reporting an issue * with DHCP relay */ static const uint32_t dhcpmon_default_unhealthy_max_count = 10; @@ -40,7 +40,7 @@ static const uint32_t dhcpmon_default_unhealthy_max_count = 10; */ static void usage(const char *prog) { - printf("Usage: %s -id {-iu }+ [-w ]" + printf("Usage: %s -id {-iu }+ -im [-w ]" "[-c ] [-s ] [-d]\n", prog); printf("where\n"); printf("\tsouth interface: is a vlan interface,\n"); @@ -50,7 +50,7 @@ static void usage(const char *prog) printf("\tunhealthy status count: count of consecutive unhealthy status before writing an alert to syslog " "(default %d),\n", dhcpmon_default_unhealthy_max_count); - printf("\tsnap length: snap length of packet capture (default %d),\n", dhcpmon_default_snaplen); + printf("\tsnap length: snap length of packet capture (default %ld),\n", dhcpmon_default_snaplen); printf("\t-d: daemonize %s.\n", prog); exit(EXIT_SUCCESS); @@ -127,7 +127,7 @@ int main(int argc, char **argv) usage(basename(argv[0])); break; case 'i': - if (dhcp_devman_add_intf(argv[i + 1], argv[i][2] == 'u') != 0) { + if (dhcp_devman_add_intf(argv[i + 1], argv[i][2]) != 0) { usage(basename(argv[0])); } i += 2; diff --git a/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf b/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf index 6eeba48fe9..435d535fb3 100644 --- a/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf +++ b/src/sonic-config-engine/tests/sample_output/docker-dhcp-relay.supervisord.conf @@ -55,7 +55,7 @@ dependent_startup_wait_for=start:exited programs=dhcpmon-Vlan1000 [program:dhcpmon-Vlan1000] -command=/usr/sbin/dhcpmon -id Vlan1000 -iu PortChannel01 -iu PortChannel02 -iu PortChannel03 -iu PortChannel04 +command=/usr/sbin/dhcpmon -id Vlan1000 -iu PortChannel01 -iu PortChannel02 -iu PortChannel03 -iu PortChannel04 -im eth0 priority=4 autostart=false autorestart=false