VOXL OpenVINS Server 1.0
Visual Inertial Odometry Server for VOXL Platform
Loading...
Searching...
No Matches
VoxlHealth.cpp
Go to the documentation of this file.
1/**
2 * @file VoxlHealth.cpp
3 * @brief Health check implementation for VOXL OpenVINS
4 * @author Zauberflote
5 * @date 2025
6 * @version 1.0
7 *
8 * This file implements the health check system for the VOXL OpenVINS server.
9 */
10
11#include "VoxlHK.h"
12using namespace voxl;
13// ============================================================================
14// HEALTH CHECK IMPLEMENTATION
15// ============================================================================
16
17/**
18 * @brief Constructor for HealthCheck
19 *
20 * Initializes the health check system with default values.
21 * The health check is not started until start() is called.
22 */
23HealthCheck::HealthCheck()
24{
25 // Initialize with current system state
26 last_error_codes_ = vio_error_codes.load();
27 last_vio_state_ = vio_state.load();
28 last_imu_connected_ = is_imu_connected.load();
29 last_cam_connected_ = is_cam_connected.load();
30 last_health_check_ns_ = _apps_time_monotonic_ns();
31}
32
33/**
34 * @brief Destructor for HealthCheck
35 *
36 * Ensures proper cleanup by calling stop() if the health check is still running.
37 */
38HealthCheck::~HealthCheck()
39{
40 stop();
41}
42
43/**
44 * @brief Start the health check system
45 *
46 * Initializes and starts the health monitoring thread that runs at 30Hz.
47 * The thread continuously monitors system health and error conditions.
48 */
50{
51 std::lock_guard<std::mutex> lock(health_mutex_);
52
53 if (running_.load())
54 {
55 std::cerr << "HealthCheck already running" << std::endl;
56 return;
57 }
58
59 running_.store(true, std::memory_order_release);
60 health_thread_ = std::thread(&HealthCheck::healthCheckLoop, this);
61 health_thread_.detach();
62
63 std::cout << "HealthCheck started - monitoring at 30Hz" << std::endl;
64}
65
66/**
67 * @brief Stop the health check system
68 *
69 * Stops the health monitoring thread and performs cleanup.
70 */
72{
73 std::lock_guard<std::mutex> lock(health_mutex_);
74
75 if (!running_.load())
76 {
77 return;
78 }
79
80 running_.store(false, std::memory_order_release);
81
82 // Give the thread a moment to finish
83 std::this_thread::sleep_for(std::chrono::milliseconds(100));
84
85 std::cout << "HealthCheck stopped" << std::endl;
86}
87
88/**
89 * @brief Main health check loop
90 *
91 * Runs at 30Hz and performs comprehensive health monitoring including:
92 * - Error code analysis and logging
93 * - System state validation
94 * - Performance monitoring
95 * - Auto-reset condition checking
96 */
97void HealthCheck::healthCheckLoop()
98{
99 const int64_t health_check_period_ns = 33333333; // 30Hz = ~33.33ms
100
101 while (running_.load() && main_running)
102 {
103 auto start_time = _apps_time_monotonic_ns();
104 // Update connectivity status first
105 checkSystemConnectivity();
106
107 // Publish blank VIO data packets when sensors are missing
108 if (!is_imu_connected.load() || !is_cam_connected.load())
109 {
110 if (!is_imu_connected.load())
111 std::cerr << "[HEALTH] ERROR: IMU disconnected; publishing blank VIO data" << std::endl;
112 if (!is_cam_connected.load())
113 std::cerr << "[HEALTH] ERROR: Camera disconnected; publishing blank VIO data" << std::endl;
114 std::cout << "[HEALTH] Publishing blank VIO packet due to missing sensors" << std::endl;
115 Publisher::getInstance().publishBlank();
116 }
117 else
118 {
119 analyzeErrorCodes();
120 monitorSystemPerformance();
121 checkAutoResetConditions();
122 checkVINSResetRequest();
123 }
124
125 // Update counters
126 health_check_count_++;
127 last_health_check_ns_ = start_time;
128
129 // Sleep to maintain 30Hz rate
130 int64_t elapsed_ns = _apps_time_monotonic_ns() - start_time;
131 int64_t sleep_ns = std::max<int64_t>(0, health_check_period_ns - elapsed_ns);
132
133 if (sleep_ns > 0)
134 {
135 std::this_thread::sleep_for(std::chrono::nanoseconds(sleep_ns));
136 }
137 }
138}
139
140/**
141 * @brief Analyze and log error codes
142 *
143 * Examines the current error codes and logs detailed information
144 * about any active errors or warnings.
145 */
146void HealthCheck::analyzeErrorCodes()
147{
148 uint32_t current_error_codes = vio_error_codes.load();
149
150 // Check for new errors
151 uint32_t new_errors = current_error_codes & ~last_error_codes_;
152 uint32_t cleared_errors = last_error_codes_ & ~current_error_codes;
153
154 if (new_errors != 0)
155 {
156 std::cerr << "[HEALTH] New errors detected: 0x" << std::hex << (int)new_errors << std::dec << std::endl;
157 printf("[DEBUG] Current error codes: 0x%x, New errors: 0x%x\n", (int)current_error_codes, (int)new_errors);
158
159 // Log specific error details
160 if (new_errors & ERROR_CODE_COVARIANCE)
161 {
162 std::cerr << "[HEALTH] ERROR: Covariance matrix not positive definite" << std::endl;
163 }
164 if (new_errors & ERROR_CODE_IMU_OOB)
165 {
166 std::cerr << "[HEALTH] ERROR: IMU exceeded range (out of bounds)" << std::endl;
167 }
168 if (new_errors & ERROR_CODE_IMU_BW)
169 {
170 std::cerr << "[HEALTH] ERROR: IMU bandwidth too low" << std::endl;
171 }
172 if (new_errors & ERROR_CODE_NOT_STATIONARY)
173 {
174 std::cerr << "[HEALTH] ERROR: System not stationary at initialization" << std::endl;
175 }
176 if (new_errors & ERROR_CODE_NO_FEATURES)
177 {
178 std::cerr << "[HEALTH] ERROR: No features for extended period" << std::endl;
179 }
180 if (new_errors & ERROR_CODE_CONSTRAINT)
181 {
182 std::cerr << "[HEALTH] ERROR: Insufficient constraints from features" << std::endl;
183 }
184 if (new_errors & ERROR_CODE_FEATURE_ADD)
185 {
186 std::cerr << "[HEALTH] ERROR: Failed to add new features" << std::endl;
187 }
188 if (new_errors & ERROR_CODE_VEL_INST_CERT)
189 {
190 std::cerr << "[HEALTH] ERROR: Exceeded instant velocity uncertainty" << std::endl;
191 }
192 if (new_errors & ERROR_CODE_VEL_WINDOW_CERT)
193 {
194 std::cerr << "[HEALTH] ERROR: Exceeded velocity uncertainty" << std::endl;
195 }
196 if (new_errors & ERROR_CODE_DROPPED_IMU)
197 {
198 std::cerr << "[HEALTH] WARNING: Dropped IMU samples" << std::endl;
199 }
200 if (new_errors & ERROR_CODE_BAD_CAM_CAL)
201 {
202 std::cerr << "[HEALTH] ERROR: Intrinsic camera calibration questionable" << std::endl;
203 }
204 if (new_errors & ERROR_CODE_LOW_FEATURES)
205 {
206 std::cerr << "[HEALTH] ERROR: Insufficient good features to initialize" << std::endl;
207 }
208 if (new_errors & ERROR_CODE_DROPPED_CAM)
209 {
210 std::cerr << "[HEALTH] WARNING: Dropped camera frame" << std::endl;
211 }
212 if (new_errors & ERROR_CODE_DROPPED_GPS_VEL)
213 {
214 std::cerr << "[HEALTH] WARNING: Dropped GPS velocity sample" << std::endl;
215 }
216 if (new_errors & ERROR_CODE_BAD_TIMESTAMP)
217 {
218 std::cerr << "[HEALTH] ERROR: Sensor measurements with bad timestamps" << std::endl;
219 printf("[DEBUG] Health check detected ERROR_CODE_BAD_TIMESTAMP\n");
220 }
221 if (new_errors & ERROR_CODE_IMU_MISSING)
222 {
223 std::cerr << "[HEALTH] ERROR: Missing IMU data" << std::endl;
224 }
225 if (new_errors & ERROR_CODE_CAM_MISSING)
226 {
227 std::cerr << "[HEALTH] ERROR: Missing camera frames" << std::endl;
228 }
229 if (new_errors & ERROR_CODE_CAM_BAD_RES)
230 {
231 std::cerr << "[HEALTH] ERROR: Camera resolution unsupported" << std::endl;
232 }
233 if (new_errors & ERROR_CODE_CAM_BAD_FORMAT)
234 {
235 std::cerr << "[HEALTH] ERROR: Camera format unsupported" << std::endl;
236 }
237 if (new_errors & ERROR_CODE_UNKNOWN)
238 {
239 std::cerr << "[HEALTH] ERROR: Unknown error" << std::endl;
240 }
241 if (new_errors & ERROR_CODE_STALLED)
242 {
243 std::cerr << "[HEALTH] ERROR: Frame processing stalled" << std::endl;
244 }
245 }
246
247 if (cleared_errors != 0)
248 {
249 std::cout << "[HEALTH] Errors cleared: 0x" << std::hex << (int)cleared_errors << std::dec << std::endl;
250 }
251
252 last_error_codes_ = current_error_codes;
253}
254
255/**
256 * @brief Check system connectivity
257 *
258 * Monitors the connection status of cameras and IMU, logging
259 * any disconnection events or connectivity issues.
260 */
261void HealthCheck::checkSystemConnectivity()
262{
263 // THIS IS A REDO OF THE PAST SYSTEM CONNECTIVITY CHECK INSIDE monitorSystemPerformance --> THIS IS A BETTER APPROACH
264 // Detect stale sensor data
265 const int64_t sensor_timeout_ns = 5000000000; // 5 second timeout --> MAYBE MAKE THIS SMALLER IF NEEDED BE
266 int64_t now_ns = _apps_time_monotonic_ns();
267 // If no new IMU data within timeout, mark IMU as disconnected
268 if (last_imu_timestamp_ns != 0 && now_ns - last_imu_timestamp_ns > sensor_timeout_ns)
269 {
270 if (is_imu_connected.load(std::memory_order_acquire))
271 {
272 std::cerr << "[HEALTH] IMU likely disconnected --> stale data (no data for "
273 << (now_ns - last_imu_timestamp_ns) / 1000000 << "ms)" << std::endl;
274 }
275 is_imu_connected.store(false, std::memory_order_release);
276 }
277 // If no new camera data within timeout, mark camera as disconnected
278 if (last_cam_time != 0 && now_ns - last_cam_time > sensor_timeout_ns)
279 {
280 if (is_cam_connected.load(std::memory_order_acquire))
281 {
282 std::cerr << "[HEALTH] Camera likely disconnected --> stale data (no data for "
283 << (now_ns - last_cam_time) / 1000000 << "ms)" << std::endl;
284 }
285 is_cam_connected.store(false, std::memory_order_release);
286 }
287
288 bool current_imu_connected = is_imu_connected.load(std::memory_order_acquire);
289 bool current_cam_connected = is_cam_connected.load(std::memory_order_acquire);
290
291 // Check IMU connection changes
292 if (current_imu_connected != last_imu_connected_)
293 {
294 if (current_imu_connected)
295 {
296 std::cout << "[HEALTH] IMU connected" << std::endl;
297 // Clear IMU-related errors when connection is restored --> CURRENTLY CLEANING ALL ERRORS
298 clearErrorCodes(0, true);
299 // Request reset upon IMU reconnection
300 reset_requested.store(true, std::memory_order_release);
301 std::cout << "[HEALTH] Reset requested due to IMU reconnection" << std::endl;
302 }
303 else
304 {
305 std::cerr << "[HEALTH] ERROR: IMU disconnected" << std::endl;
306 vio_error_codes |= ERROR_CODE_IMU_MISSING;
307 }
308 last_imu_connected_ = current_imu_connected;
309 }
310
311 // Check camera connection changes
312 if (current_cam_connected != last_cam_connected_)
313 {
314 if (current_cam_connected)
315 {
316 std::cout << "[HEALTH] Camera connected" << std::endl;
317 // Clear camera-related errors when connection is restored
318 // CAN PROBABLY CLEAR ALL ERRORS HERE...
319 clearErrorCodes(ERROR_CODE_CAM_MISSING | ERROR_CODE_DROPPED_CAM);
320
321 // Don't trigger a reset on the very first connection
322 if (first_camera_connection_seen_) {
323 // Request reset upon camera reconnection
324 reset_requested.store(true, std::memory_order_release);
325 std::cout << "[HEALTH] Reset requested due to camera reconnection" << std::endl;
326 } else {
327 first_camera_connection_seen_ = true; // no reset on first connection
328 }
329 }
330 else
331 {
332 std::cerr << "[HEALTH] ERROR: Camera disconnected" << std::endl;
333 vio_error_codes |= ERROR_CODE_CAM_MISSING;
334 }
335 last_cam_connected_ = current_cam_connected;
336 }
337
338 // Check VIO state changes
339 uint8_t current_vio_state = vio_state.load(std::memory_order_acquire);
340 if (current_vio_state != last_vio_state_)
341 {
342 std::cout << "[HEALTH] VIO state changed: " << (int)last_vio_state_ << " -> " << (int)current_vio_state << std::endl;
343 last_vio_state_ = current_vio_state;
344 }
345}
346
347/**
348 * @brief Monitor system performance
349 *
350 * Tracks system performance metrics including processing rates,
351 * memory usage, and timing statistics.
352 */
353void HealthCheck::monitorSystemPerformance()
354{
355 static int64_t last_performance_log_ns = 0;
356 int64_t current_time_ns = _apps_time_monotonic_ns();
357
358 // Log performance metrics every 5 seconds
359 if (current_time_ns - last_performance_log_ns > 5000000000)
360 { // 5 seconds
361 std::cout << "[HEALTH] Performance - Health checks: " << health_check_count_
362 << ", IMU timestamp: " << last_imu_timestamp_ns
363 << ", Camera timestamp: " << last_cam_time << std::endl;
364
365 last_performance_log_ns = current_time_ns;
366 health_check_count_ = 0; // Reset counter
367 }
368}
369
370/**
371 * @brief Check auto-reset conditions
372 *
373 * Evaluates whether auto-reset conditions are met based on
374 * current system state and error conditions.
375 */
376void HealthCheck::checkAutoResetConditions()
377{
378 if (!en_auto_reset)
379 {
380 return; // Auto-reset disabled
381 }
382
383 // Suppress auto-reset for a grace period after a hard reset to allow sensors to come back online
384 int64_t now = _apps_time_monotonic_ns();
385 if (now - time_of_last_reset < INIT_FAILURE_TIMEOUT_NS)
386 {
387 return;
388 }
389
390 // Also skip auto-reset logic while the VIO manager is still initializing. OpenVINS may
391 // require several seconds of IMU/vision data and heavy optimization before the
392 // "initialized()" flag is set; triggering another reset in that window leads to a loop.
393 if (!vio_manager || !vio_manager->initialized())
394 {
395 return;
396 }
397
398 uint32_t current_error_codes = vio_error_codes.load();
399
400 if (current_error_codes != 0)
401 {
402 std::cerr << "[HEALTH] AUTO-RESET RECOMMENDED: Error code(s) detected: 0x" << std::hex << (int)current_error_codes << std::dec << std::endl;
403 clearErrorCodes(0, true);
404 // Set reset flag (this would trigger reset in main loop)
405 reset_requested.store(true, std::memory_order_release);
406 }
407}
408
409/**
410 * @brief Check for VINS reset request
411 *
412 * This function checks if a reset has been requested and handles the reset process.
413 * It ensures that only one reset operation can be in progress at a time.
414 */
415void HealthCheck::checkVINSResetRequest()
416{
417 // atomically check if a reset has been requested, if not, return
418 if (!reset_requested.exchange(false, std::memory_order_acq_rel))
419 return;
420
421 // check time since last reset
422 int64_t current_time = _apps_time_monotonic_ns();
423 uint64_t time_since_reset = current_time - time_of_last_reset;
424 if (time_since_reset <= INIT_FAILURE_TIMEOUT_NS)
425 {
426 std::cout << "[HEALTH] Reset requested but last reset was too recent ("
427 << (time_since_reset / 1000000) << "ms ago), ignoring request" << std::endl;
428 return;
429 }
430
431 // If reset is requested, check if we are already resetting
432 if (is_resetting.exchange(true, std::memory_order_acq_rel))
433 {
434 std::cout << "[HEALTH] Reset already in progress, ignoring request\n";
435 return;
436 }
437
438 if (en_debug)
439 std::cout << "[HEALTH] Reset requested, preparing to reset VIO system" << std::endl;
440
441 int rc = 0;
442 try
443 {
444 rc = doHardReset();
445 reset_num_counter.fetch_add(1, std::memory_order_acq_rel);
446 }
447 catch (const std::exception &e)
448 {
449 fprintf(stderr, "[ERROR] Exception during reset: %s\n", e.what());
450 // Check if it's a permission error
451 if (strstr(e.what(), "Operation not permitted") != nullptr)
452 {
453 fprintf(stderr, "[ERROR] Permission denied during reset - this may be due to insufficient privileges\n");
454 }
455 rc = -1;
456 }
457
458 if (rc == 0)
459 {
460 std::cout << "[HEALTH] VIO system reset successfully" << std::endl;
461
462 // CRITICAL: Clear VIO state back to INITIALIZING after successful reset
463 vio_state.store(VIO_STATE_INITIALIZING, std::memory_order_release);
464 std::cout << "[HEALTH] VIO state set to INITIALIZING after reset" << std::endl;
465
466 // Clear last sensor timestamps; they will be filled when fresh data arrives
468 last_cam_time = 0;
469 }
470 else
471 {
472 std::cerr << "[HEALTH] VIO system reset failed with code: " << rc << std::endl;
473 // Even on failure, set to INITIALIZING so system can try again
474 vio_state.store(VIO_STATE_INITIALIZING, std::memory_order_release);
475 // Clear reset flags even on failure to prevent getting stuck --> PRIME MOVE HERE
476 reset_requested.store(false, std::memory_order_release);
477 }
478
479 time_of_last_reset = _apps_time_monotonic_ns();
480
481 is_resetting.store(false, std::memory_order_release);
482 reset_cv.notify_all();
483 return;
484}
485
486int HealthCheck::doHardReset()
487{
488 // wait until all callbacks have finished processing
489 {
490 std::unique_lock<std::mutex> lk(reset_mtx);
491 // Add timeout to prevent infinite blocking --> FOR NOW, 5 SECONDS
492 bool wait_result = reset_cv.wait_for(lk, std::chrono::seconds(5),
493 [this]
494 {
495 auto cur = active_callbacks.load(std::memory_order_acquire);
496 return cur == 0;
497 });
498
499 if (!wait_result)
500 {
501 fprintf(stderr, "[ERROR] Timeout waiting for callbacks to finish during reset. active_callbacks=%d\n",
502 active_callbacks.load(std::memory_order_acquire));
503 return -1;
504 }
505 }
507 clearErrorCodes(0, true);
508 printf("[HEALTH] Hard reset in progress\n");
509
510 // ensure we have a valid and initialized VIO manager; if not, create one directly
511 if (!vio_manager && !vio_manager->initialized())
512 {
513 if (en_debug)
514 std::cout << "[HEALTH] VIO manager was uninitialized, creating a fresh instance" << std::endl;
515
516 try
517 {
518 vio_manager = std::make_unique<ov_msckf::VioManager>(vio_manager_options);
519 }
520 catch (const std::exception &e)
521 {
522 fprintf(stderr, "[ERROR] Failed to create VIO manager during reset: %s\n", e.what());
523 return -1;
524 }
525
526 return 0; // fresh manager created, nothing else to reset
527 }
528
529 // Create references for old and new VIO manager
530 std::unique_ptr<ov_msckf::VioManager> old_vio_manager;
531 std::unique_ptr<ov_msckf::VioManager> new_vio_manager;
532
533 try
534 {
535 new_vio_manager = std::make_unique<ov_msckf::VioManager>(vio_manager_options);
536
537 if (!new_vio_manager)
538 {
539 fprintf(stderr, "[ERROR] Failed to create new VIO manager object\n");
540 throw std::runtime_error("Failed to create new VIO manager");
541 }
542
543 old_vio_manager = std::move(vio_manager);
544 vio_manager = std::move(new_vio_manager);
545 }
546 catch (const std::exception &e)
547 {
548 fprintf(stderr, "[ERROR] Exception during VIO manager creation: %s\n", e.what());
549 if (old_vio_manager)
550 {
551 vio_manager = std::move(old_vio_manager); // restore previous manager
552 }
553 else
554 {
555 std::cerr << "[HEALTH] Warning: no previous VIO manager to restore" << std::endl;
556 }
557 return -1;
558 }
559
560 // destroy old VIO manager
561 old_vio_manager.reset();
562
563 return 0;
564}
565
566/**
567 * @brief Clear specific error codes
568 *
569 * Clears the specified error codes from the global error state.
570 * This is useful when errors are resolved and should no longer
571 * be reported.
572 *
573 * @param error_mask Bit mask of error codes to clear
574 */
575void HealthCheck::clearErrorCodes(uint32_t error_mask, bool clear_all)
576{
577 if (clear_all)
578 {
579 vio_error_codes.store(0, std::memory_order_release);
580 }
581 else
582 {
583 uint32_t current_errors = vio_error_codes.load(std::memory_order_acquire);
584 uint32_t new_errors = current_errors & ~error_mask;
585 vio_error_codes.store(new_errors, std::memory_order_release);
586 }
587
588 if (en_debug)
589 {
590 std::cout << "[HEALTH] Cleared error codes: 0x" << std::hex << (int)error_mask << std::dec << std::endl;
591 }
592}
Housekeeping and data publishing for VOXL OpenVINS.
volatile int64_t last_cam_time
Timestamp of last camera data (nanoseconds)
Definition VoxlVars.cpp:199
volatile int64_t last_imu_timestamp_ns
Timestamp of last IMU data (nanoseconds)
Definition VoxlVars.cpp:177
volatile int main_running
Main process running flag.
Definition VoxlVars.cpp:38
std::atomic< uint32_t > active_callbacks
Number of callbacks inside the system.
Definition VoxlVars.cpp:56
ov_msckf::VioManagerOptions vio_manager_options
VIO manager options.
Definition VoxlVars.cpp:28
int en_auto_reset
Enable automatic reset functionality.
Definition VoxlVars.cpp:78
std::mutex reset_mtx
Mutex used by reset thread.
Definition VoxlVars.cpp:59
std::atomic< uint32_t > reset_num_counter
Counter which increments on resets.
Definition VoxlVars.cpp:53
std::unique_ptr< ov_msckf::VioManager > vio_manager
Main VIO manager instance.
Definition VoxlVars.cpp:31
int en_debug
Enable debug output.
Definition VoxlVars.cpp:148
std::condition_variable reset_cv
Reset conditional variable.
Definition VoxlVars.cpp:62
std::atomic< bool > is_resetting
VIO reset state flag.
std::atomic< uint8_t > vio_state
Current VIO system state.
std::atomic< uint32_t > vio_error_codes
VIO error codes.
std::atomic< bool > is_imu_connected
IMU connection state.
std::atomic< bool > reset_requested
Should reset flag.
std::atomic< bool > is_cam_connected
Camera connection state.
void start()
Start the health check system.
static void clearErrorCodes(uint32_t error_mask)
Clear specific error codes.
Definition VoxlHK.h:334
void stop()
Stop the health check system.
void set_first_packet(bool first_packet_)
Set the first packet flag.
Definition VoxlHK.h:235
static Publisher & getInstance()
Get singleton instance.
Definition VoxlHK.h:142
Main namespace for VOXL OpenVINS server components.