From 00d1fb62b9643c5e9872304d04c700c68f7bd3fa Mon Sep 17 00:00:00 2001
From: Libres-coder <2597242922@qq.com>
Date: Sun, 21 Sep 2025 05:25:13 +0800
Subject: [PATCH] feat: enhance build failure diagnosis and automatic
troubleshooting capabilities
---
container/service-health/html/diagnosis.html | 351 ++++++++++++
container/service-health/html/index.html | 222 ++++----
.../service-health/html/job_details.html | 248 +++++++++
container/web-backend/web-backend | 245 +++++++++
etc/compass-ci/diagnosis/config.yaml | 168 ++++++
lib/job_diagnosis.rb | 421 +++++++++++++++
lib/job_diagnosis_api.rb | 502 ++++++++++++++++++
lib/job_diagnosis_library.rb | 378 +++++++++++++
lib/job_diagnosis_monitoring.rb | 308 +++++++++++
src/lib/web_backend.rb | 181 ++++++-
10 files changed, 2906 insertions(+), 118 deletions(-)
create mode 100644 container/service-health/html/diagnosis.html
create mode 100644 container/service-health/html/job_details.html
create mode 100644 etc/compass-ci/diagnosis/config.yaml
create mode 100644 lib/job_diagnosis.rb
create mode 100644 lib/job_diagnosis_api.rb
create mode 100644 lib/job_diagnosis_library.rb
create mode 100644 lib/job_diagnosis_monitoring.rb
diff --git a/container/service-health/html/diagnosis.html b/container/service-health/html/diagnosis.html
new file mode 100644
index 000000000..73851da1b
--- /dev/null
+++ b/container/service-health/html/diagnosis.html
@@ -0,0 +1,351 @@
+
+
+
+
+ Build Diagnosis
+
+
+
+ Compass CI Build Failure Diagnosis Center
+
+
+
+
+
+ System Health Status
+
+ |
+ Diagnosis Function
+ |
+ Checking... |
+
+
+ |
+ Error Type Library
+ |
+ Loading... |
+
+
+ |
+ System Status
+ |
+ Checking... |
+
+
+
+
+
+
+
+
+
+
+ Diagnosis Result
+
+ |
+ Job ID
+ |
+ |
+
+
+ |
+ Job State
+ |
+ |
+
+
+ |
+ Error Type
+ |
+ |
+
+
+ |
+ Possible Causes
+ |
+ |
+
+
+ |
+ Suggested Solutions
+ |
+ |
+
+
+
+
+
+
+
+
+
+
+ Troubleshooting Guidance
+
+ |
+ Error Description
+ |
+ |
+
+
+ |
+ Troubleshooting Steps
+ |
+ |
+
+
+
+
+
+
+
+ Common Error Types
+
+ | Error Type |
+ Typical Symptoms |
+ Common Solutions |
+
+
+ | build_dependency_failed |
+ Package installation failure |
+ Check package versions and source configuration |
+
+
+ | test_execution_error |
+ Test execution interruption |
+ Check test environment and test scripts |
+
+
+ | testbox_error |
+ Test machine failure |
+ Check test machine status and network connection |
+
+
+ | scheduler_error |
+ Scheduler exception |
+ Check scheduling queue and resource allocation |
+
+
+ | timeout_error |
+ Execution timeout |
+ Check execution time limits and system load |
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/container/service-health/html/index.html b/container/service-health/html/index.html
index aa78fe05b..12ee09b27 100644
--- a/container/service-health/html/index.html
+++ b/container/service-health/html/index.html
@@ -1,108 +1,114 @@
-
-
-
-
-
- Health Monitor
-
-
-
-
- Compass CI 服务健康监视器
-
-
-
- |
- git mirror
- |
-
-
- machine list
- |
-
-
- log errors
- |
-
-
-
- |
- job stderrs
- |
-
- jobs boot time
- |
-
- srpm list
- |
-
-
-
- |
- no service
- |
-
- no service
- |
-
- no service
- |
-
-
-
-
-
-
+
+
+
+
+
+ Health Monitor
+
+
+
+
+ Compass CI 服务健康监视器
+
+
+
+ |
+ git mirror
+ |
+
+
+ machine list
+ |
+
+
+ log errors
+ |
+
+
+
+ |
+ job stderrs
+ |
+
+ jobs boot time
+ |
+
+ srpm list
+ |
+
+
+
+ |
+ build diagnosis
+ |
+
+ job details
+ |
+
+ no service
+ |
+
+
+
+
+
+
diff --git a/container/service-health/html/job_details.html b/container/service-health/html/job_details.html
new file mode 100644
index 000000000..07d411dd5
--- /dev/null
+++ b/container/service-health/html/job_details.html
@@ -0,0 +1,248 @@
+
+
+
+
+ Job Details
+
+
+
+ Job Details and Diagnosis
+
+
+
+
+
+
+
+
+ Job Information
+
+ |
+ Job ID
+ |
+ |
+
+
+ |
+ Job State
+ |
+ |
+
+
+ |
+ Submit Time
+ |
+ |
+
+
+ |
+ Test Suite
+ |
+ |
+
+
+ |
+ Operating System
+ |
+ |
+
+
+ |
+ Test Box
+ |
+ |
+
+
+
+
+
+
+
+
+
+
+ Common Job State Description
+
+ | State |
+ Meaning |
+ Handling Suggestions |
+
+
+ | failed |
+ Job execution failed |
+ Check error logs, verify configuration and environment |
+
+
+ | incomplete |
+ Job not completed |
+ Check execution time and resource limits |
+
+
+ | timeout |
+ Execution timeout |
+ Check system load and time limit settings |
+
+
+ | running |
+ Currently executing |
+ Wait for completion or check execution progress |
+
+
+ | finished |
+ Completed normally |
+ View execution results and generated reports |
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/container/web-backend/web-backend b/container/web-backend/web-backend
index 791d044ab..5bed32528 100755
--- a/container/web-backend/web-backend
+++ b/container/web-backend/web-backend
@@ -66,6 +66,13 @@ get '/web_backend/get_jobs' do
get_jobs(params)
end
+# GET /web_backend/job_info/:job_id
+# Get detailed information for a single job (including diagnosis information)
+# Response: Detailed information for a single job, including complete diagnosis data
+get '/web_backend/job_info/:job_id' do
+ get_single_job_info(params.merge(job_id: params[:job_id]))
+end
+
# GET /web_backend/active_testbox
# return to testbox that are active within 30 minutes
get '/web_backend/active_testbox' do
@@ -507,3 +514,241 @@ end
get '/user_auth/get_client_info' do
client_info()
end
+
+# ==================== Diagnosis Feature API Routes ====================
+# The following routes are only available when the diagnosis feature is enabled
+
+# GET /diagnosis/status
+# Response: Diagnosis feature status information
+get '/diagnosis/status' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_status)
+ diagnosis_status
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Diagnosis feature not available', 'enabled' => ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true'}.to_json]
+ end
+end
+
+# GET /diagnosis/health
+# Response: Diagnosis feature health check results
+get '/diagnosis/health' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_health_check)
+ diagnosis_health_check
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Diagnosis health check not available'}.to_json]
+ end
+end
+
+# GET /diagnosis/error_types
+# Response: List of supported error types
+get '/diagnosis/error_types' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_error_types)
+ diagnosis_error_types
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Diagnosis error types not available'}.to_json]
+ end
+end
+
+# GET /diagnosis/error_guidance/:error_type
+# Response: Guidance information for specific error types
+get '/diagnosis/error_guidance/:error_type' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_error_guidance)
+ diagnosis_error_guidance(params.merge(error_type: params[:error_type]))
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Diagnosis error guidance not available'}.to_json]
+ end
+end
+
+# GET /diagnosis/capabilities
+# Response: Diagnosis feature integrity check
+get '/diagnosis/capabilities' do
+ capabilities = {
+ 'diagnosis_enabled' => ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true',
+ 'available_features' => [],
+ 'api_endpoints' => [],
+ 'system_status' => 'unknown'
+ }
+
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true'
+ # Check availability of each functional module
+ capabilities['available_features'] << 'error_identification' if respond_to?(:enhance_job_info_with_diagnosis)
+ capabilities['available_features'] << 'health_check' if respond_to?(:diagnosis_health_check)
+ capabilities['available_features'] << 'error_guidance' if respond_to?(:diagnosis_error_guidance)
+ capabilities['available_features'] << 'status_monitoring' if respond_to?(:diagnosis_status)
+ capabilities['available_features'] << 'user_feedback' if respond_to?(:record_diagnosis_feedback)
+
+ # Available API endpoints
+ capabilities['api_endpoints'] = [
+ '/diagnosis/status',
+ '/diagnosis/health',
+ '/diagnosis/error_types',
+ '/diagnosis/error_guidance/:type',
+ '/diagnosis/capabilities',
+ '/diagnosis/feedback',
+ '/diagnosis/quality_report'
+ ]
+
+ # System status
+ capabilities['system_status'] = capabilities['available_features'].length > 0 ? 'operational' : 'limited'
+ else
+ capabilities['system_status'] = 'disabled'
+ end
+
+ [200, headers.merge('Access-Control-Allow-Origin' => '*'), capabilities.to_json]
+end
+
+# POST /diagnosis/diagnose
+# Diagnose specific job
+post '/diagnosis/diagnose' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_diagnose_job)
+ request.body.rewind
+ begin
+ data = JSON.parse(request.body.read)
+ job_id = data['job_id']
+
+ if job_id.nil? || job_id.strip.empty?
+ [400, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'job_id is required'}.to_json]
+ else
+ result = diagnosis_diagnose_job(job_id.strip)
+
+ if result
+ [200, headers.merge('Access-Control-Allow-Origin' => '*'), result.to_json]
+ else
+ [500, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Failed to diagnose job'}.to_json]
+ end
+ end
+ rescue JSON::ParserError
+ [400, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Invalid JSON format'}.to_json]
+ rescue StandardError => e
+ [500, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => "Diagnosis failed: #{e.message}"}.to_json]
+ end
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Diagnosis service not available'}.to_json]
+ end
+end
+
+# POST /diagnosis/start_wizard
+# Start interactive diagnosis wizard
+post '/diagnosis/start_wizard' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_start_wizard)
+ request.body.rewind
+ wizard_params = JSON.parse(request.body.read) rescue {}
+ diagnosis_start_wizard(wizard_params.transform_keys(&:to_sym))
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'Diagnosis wizard not available' }.to_json]
+ end
+end
+
+# POST /diagnosis/wizard_step
+# Process wizard steps
+post '/diagnosis/wizard_step' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_wizard_step)
+ request.body.rewind
+ step_params = JSON.parse(request.body.read) rescue {}
+ diagnosis_wizard_step(step_params.transform_keys(&:to_sym))
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'Diagnosis wizard not available' }.to_json]
+ end
+end
+
+# GET /diagnosis/generate_report/:job_id
+# Generate comprehensive diagnosis report
+get '/diagnosis/generate_report/:job_id' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_generate_report)
+ diagnosis_generate_report(params.merge(job_id: params[:job_id]))
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'Diagnosis report generation not available' }.to_json]
+ end
+end
+
+# GET /diagnosis/monitoring/status
+# Response: Monitoring system status information
+get '/diagnosis/monitoring/status' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_monitoring_status)
+ diagnosis_monitoring_status
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'Monitoring status not available' }.to_json]
+ end
+end
+
+# GET /diagnosis/monitoring/health
+# Response: System health monitoring data
+get '/diagnosis/monitoring/health' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_system_health)
+ diagnosis_system_health
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'System health monitoring not available' }.to_json]
+ end
+end
+
+# GET /diagnosis/monitoring/metrics
+# Response: Current health metrics data
+get '/diagnosis/monitoring/metrics' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_health_metrics)
+ diagnosis_health_metrics
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'Health metrics collection not available' }.to_json]
+ end
+end
+
+# POST /diagnosis/feedback
+# Receive user feedback on diagnosis results
+post '/diagnosis/feedback' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:record_diagnosis_feedback)
+ request.body.rewind
+ feedback_data = JSON.parse(request.body.read) rescue {}
+ result = record_diagnosis_feedback(feedback_data)
+
+ if result
+ [200, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'status' => 'success', 'message' => 'Feedback recorded successfully'}.to_json]
+ else
+ [400, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'status' => 'error', 'message' => 'Failed to record feedback'}.to_json]
+ end
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Feedback system not available'}.to_json]
+ end
+end
+
+# GET /diagnosis/quality_report
+# Get diagnosis quality report
+get '/diagnosis/quality_report' do
+ if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:get_diagnosis_quality_report)
+ report = get_diagnosis_quality_report
+
+ if report
+ [200, headers.merge('Access-Control-Allow-Origin' => '*'), report.to_json]
+ else
+ [500, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Failed to generate quality report'}.to_json]
+ end
+ else
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'),
+ {'error' => 'Quality report not available'}.to_json]
+ end
+end
+
+# OPTIONS support for cross-domain requests
+options '/diagnosis/*' do
+ [200, headers.merge({
+ 'Access-Control-Allow-Origin' => '*',
+ 'Access-Control-Allow-Methods' => 'GET,POST,OPTIONS',
+ 'Access-Control-Allow-Headers' => 'Content-Type,Authorization'
+ }), '']
+end
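
Usage sketch (illustrative, not applied by this patch): a client could exercise the new POST /diagnosis/diagnose route roughly as below; the localhost:10002 listener and the job id are assumptions, and COMPASS_DIAGNOSIS_ENABLED=true must be set for the server process:

    require 'net/http'
    require 'json'

    uri = URI('http://localhost:10002/diagnosis/diagnose')     # host/port are assumptions
    req = Net::HTTP::Post.new(uri, 'Content-Type' => 'application/json')
    req.body = { 'job_id' => 'crystal.123456' }.to_json        # hypothetical job id
    res = Net::HTTP.start(uri.hostname, uri.port) { |http| http.request(req) }
    puts res.code                                               # 200 on success
    puts JSON.parse(res.body)['error_type'] if res.code == '200'

The route answers 400 for a missing or malformed job_id and 503 when the diagnosis feature is disabled, so callers should branch on the status code rather than assume a diagnosis payload.
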
diff --git a/etc/compass-ci/diagnosis/config.yaml b/etc/compass-ci/diagnosis/config.yaml
new file mode 100644
index 000000000..8d04339a4
--- /dev/null
+++ b/etc/compass-ci/diagnosis/config.yaml
@@ -0,0 +1,168 @@
+# Compass CI job diagnosis feature configuration file
+# Configures the behavior and parameters of the diagnosis feature
+
+# Diagnosis feature switch (can also be controlled via the COMPASS_DIAGNOSIS_ENABLED environment variable)
+enabled: true
+
+# Diagnosis feature version
+version: "1.0.0"
+
+# Configuration file version
+config_version: "1.1.0"
+
+# Feature flags
+feature_flags:
+  # Enable multiple difficulty levels
+  enable_difficulty_levels: true
+
+  # Enable smart recommendations
+  enable_smart_recommendations: true
+
+  # Enable the interactive wizard
+  enable_interactive_wizard: true
+
+  # Enable contextual help
+  enable_contextual_help: true
+
+# API endpoints that need enhanced diagnosis information
+endpoints:
+  - '/web_backend/get_jobs'
+  - '/web_backend/job_info'
+  - '/web_backend/get_job'
+  - '/web_backend/compare'
+
+# Diagnosis feature configuration
+diagnosis:
+  # Whether to include diagnosis statistics in responses
+  include_statistics: true
+
+  # Whether to log diagnosis operations
+  log_diagnosis_operations: false
+
+  # Maximum diagnosis processing time (milliseconds)
+  max_processing_time_ms: 100
+
+  # Diagnosis information cache duration (seconds)
+  cache_duration_seconds: 300
+
+# Help center configuration
+help_center:
+ base_url: 'https://compass-ci.readthedocs.io'
+ troubleshooting_path: '/troubleshooting/'
+ build_failure_guide: '/docs/build-failure-troubleshooting.html'
+ timeout_guide: '/docs/timeout-issues.html'
+
+# Error severity configuration
+severity_levels:
+ high:
+ - 'runtime_crash'
+ - 'out_of_memory'
+ - 'boot_failure'
+ medium:
+ - 'execution_timeout'
+ - 'disk_failure'
+ - 'download_failure'
+ low:
+ - 'build_failure'
+ - 'install_failure'
+ - 'abnormal_termination'
+
+# Estimated fix time configuration (minutes)
+estimated_fix_times:
+ build_failure: 10
+ install_failure: 15
+ execution_timeout: 20
+ download_failure: 25
+ runtime_crash: 30
+ out_of_memory: 35
+ boot_failure: 45
+ disk_failure: 50
+
+# Diagnosis module configuration
+modules:
+  # Core diagnosis engine
+  core_diagnosis: true
+
+  # Web backend middleware
+  web_middleware: true
+
+  # Standalone API service (future extension)
+  standalone_api: false
+
+  # Health check module
+  health_check: true
+
+# Performance configuration
+performance:
+  # Whether to enable diagnosis caching
+  enable_caching: true
+
+  # Cache expiry time (seconds)
+  cache_ttl: 300
+
+  # Maximum number of concurrent diagnosis tasks
+  max_concurrent_diagnoses: 10
+
+  # Diagnosis timeout (milliseconds)
+  diagnosis_timeout_ms: 500
+
+# Security configuration
+security:
+  # Whether to include detailed error stacks in responses
+  include_error_stack: false
+
+  # Whether to allow external configuration overrides
+  allow_config_override: true
+
+  # Trusted IP address list (for administrative interfaces)
+ trusted_ips: ["127.0.0.1", "::1"]
+
+# Monitoring configuration
+monitoring:
+  # Whether to enable diagnosis metrics collection
+  enable_metrics: true
+
+  # Metrics collection interval (seconds)
+  metrics_interval: 60
+
+  # Whether to track diagnosis performance statistics
+  track_performance: true
+
+# User experience personalization configuration
+user_experience:
+  # Default difficulty level
+  default_difficulty_level: 'intermediate'
+
+  # Whether to enable smart difficulty detection
+  smart_difficulty_detection: true
+
+  # Configuration for users with different skill levels
+ difficulty_levels:
+ beginner:
+ show_detailed_explanations: true
+ include_terminology_help: true
+ provide_visual_aids: true
+ estimated_time_multiplier: 1.5
+
+ intermediate:
+ show_key_points: true
+ highlight_common_pitfalls: true
+ provide_best_practices: true
+ estimated_time_multiplier: 1.0
+
+ expert:
+ show_concise_info: true
+ enable_advanced_options: true
+ suggest_automation: true
+ estimated_time_multiplier: 0.7
+
+# Adaptive learning configuration
+adaptive_learning:
+  # Whether to enable user behavior learning
+  enable_user_behavior_learning: false
+
+  # Whether to adjust recommendations based on success rate
+  adjust_recommendations: true
+
+  # Whether to track user preferences
+  track_user_preferences: false
\ No newline at end of file
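
Usage sketch (assumed; the loading code is not part of this patch - the Ruby modules above only gate on the environment variable): the file could be read with the standard YAML library like this:

    require 'yaml'

    config_path = '/etc/compass-ci/diagnosis/config.yaml'
    config = File.exist?(config_path) ? YAML.safe_load(File.read(config_path)) : {}

    # Fall back to the environment switch when the file is absent
    enabled   = config.fetch('enabled', ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true')
    cache_ttl = config.dig('performance', 'cache_ttl') || 300
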
diff --git a/lib/job_diagnosis.rb b/lib/job_diagnosis.rb
new file mode 100644
index 000000000..7e10c31e2
--- /dev/null
+++ b/lib/job_diagnosis.rb
@@ -0,0 +1,421 @@
+# SPDX-License-Identifier: MulanPSL-2.0+
+# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved.
+# frozen_string_literal: true
+
+# Compass CI job diagnosis core module
+# Core diagnosis logic with minimal dependencies
+
+require 'time'
+require_relative 'job_diagnosis_library'
+
+def diagnosis_enabled?
+ ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true'
+end
+
+def diagnose_job(job_data)
+ return job_data unless diagnosis_enabled?
+ return job_data unless valid_job_data?(job_data)
+ return job_data unless needs_diagnosis?(job_data)
+
+ diagnosis_info = analyze_job_error(job_data)
+ return job_data unless diagnosis_info.is_a?(Hash)
+
+ job_data.merge('diagnosis' => diagnosis_info)
+rescue StandardError => e
+ log_diagnosis_error(e, 'diagnose_job') if defined?(log_error)
+ job_data
+end
+
+def diagnose_jobs(jobs)
+ return jobs unless diagnosis_enabled?
+ return jobs unless jobs.is_a?(Array) && !jobs.empty?
+
+ return jobs if system_overloaded?
+
+ max_size = calculate_safe_batch_size(jobs.length)
+
+ if jobs.length > max_size
+ # Prioritize failed jobs for diagnosis
+ priority_jobs, normal_jobs = prioritize_jobs_for_diagnosis(jobs)
+
+    # Spend half of the quota on priority jobs and the rest on normal jobs,
+    # then append the jobs that were skipped this round undiagnosed
+    priority_diagnosed = priority_jobs.first(max_size / 2).map { |job| diagnose_job_with_timeout(job) }
+    remaining_quota = max_size - priority_diagnosed.length
+    normal_diagnosed = normal_jobs.first(remaining_quota).map { |job| diagnose_job_with_timeout(job) }
+
+    leftover = priority_jobs.drop(priority_diagnosed.length) + normal_jobs.drop(normal_diagnosed.length)
+    priority_diagnosed + normal_diagnosed + leftover
+ else
+ jobs.map { |job| diagnose_job_with_timeout(job) }
+ end
+rescue StandardError => e
+ log_diagnosis_error(e, 'diagnose_jobs') if defined?(log_error)
+ jobs
+end
+
+def system_overloaded?
+ begin
+ if RUBY_PLATFORM.match?(/win32|mingw|mswin/)
+ # Windows: check CPU usage
+ cpu_usage = `wmic cpu get loadpercentage /value`.match(/LoadPercentage=(\d+)/)[1].to_f
+ cpu_usage > 80.0 # Consider overloaded if CPU usage > 80%
+ else
+ # Linux/Unix: check load average
+ load_avg = `uptime`.match(/load average: ([\d\.]+)/)[1].to_f
+ load_avg > 5.0 # Consider overloaded if load > 5.0
+ end
+ rescue StandardError
+ false # Consider normal if load check fails
+ end
+end
+
+def calculate_safe_batch_size(total_jobs)
+ base_size = 50
+
+ # Dynamically adjust based on total job count
+ if total_jobs > 1000
+ base_size = 30
+ elsif total_jobs > 500
+ base_size = 40
+ end
+
+ # Check available memory - cross-platform compatible
+ begin
+ if RUBY_PLATFORM.match?(/win32|mingw|mswin/)
+ # Windows: use wmic to get available memory
+ available_mem = `wmic OS get FreePhysicalMemory /value`.match(/FreePhysicalMemory=(\d+)/)[1].to_i / 1024
+ else
+ # Linux/Unix: use free command
+ mem_info = `free -m`.lines[1].split
+ available_mem = mem_info[6].to_i
+ end
+
+ if available_mem < 500 # Available memory < 500MB
+ base_size = [base_size / 2, 10].max
+ end
+ rescue StandardError
+ # Memory check failed, use conservative value
+ base_size = 20
+ end
+
+ base_size
+end
+
+def prioritize_jobs_for_diagnosis(jobs)
+ priority_states = %w[failed incomplete timeout execution_stuck]
+
+ priority_jobs = jobs.select { |job| priority_states.include?(job['job_state']) }
+ normal_jobs = jobs - priority_jobs
+
+ [priority_jobs, normal_jobs]
+end
+
+def diagnose_job_with_timeout(job_data, timeout_seconds = 5)
+ require 'timeout'
+
+ Timeout.timeout(timeout_seconds) do
+ diagnose_job(job_data)
+ end
+rescue Timeout::Error
+ log_diagnosis_error(StandardError.new('Diagnosis timeout'), 'diagnose_job_timeout') if defined?(log_error)
+ job_data
+rescue StandardError => e
+ log_diagnosis_error(e, 'diagnose_job_with_timeout') if defined?(log_error)
+ job_data
+end
+
+def enhance_response_with_diagnosis(response)
+ return response unless diagnosis_enabled?
+ return response unless response.is_a?(Hash)
+
+ response.merge('diagnosis_available' => true, 'diagnosis_version' => '1.0.0')
+rescue StandardError => e
+ log_diagnosis_error(e, 'enhance_response') if defined?(log_error)
+ response
+end
+
+def valid_job_data?(job_data)
+ job_data.is_a?(Hash) && !job_data.empty? && job_data['job_state'].is_a?(String)
+end
+
+def needs_diagnosis?(job_data)
+ job_state = job_data['job_state']
+
+ success_states = %w[finished complete extract_result_finished extract_stats_finished]
+ return false if success_states.include?(job_state)
+
+ running_states = %w[running queued post_run]
+ if running_states.include?(job_state)
+ return check_if_stuck?(job_data, job_state)
+ end
+
+ true
+end
+
+def check_if_stuck?(job_data, job_state)
+ return false unless job_data['submit_time']
+
+ begin
+ submit_time = Time.parse(job_data['submit_time'])
+ current_time = Time.now
+ elapsed_time = current_time - submit_time
+
+ timeout_thresholds = {
+ 'queued' => 3600,
+ 'running' => 7200,
+ 'post_run' => 1800
+ }
+
+ threshold = timeout_thresholds[job_state] || 1800
+ elapsed_time > threshold
+ rescue StandardError
+ false
+ end
+end
+
+def analyze_job_error(job_data)
+ job_state = job_data['job_state']
+ stats = extract_stats_data(job_data)
+ error_ids = extract_error_ids_data(job_data)
+
+ error_type = detect_error_type(job_state, stats, error_ids, job_data)
+ error_info = DIAGNOSIS_LIBRARY[error_type] || default_error_info
+
+ build_diagnosis_result(job_data, job_state, error_type, error_info)
+end
+
+def extract_stats_data(job_data)
+ stats = job_data['stats'] || job_data.dig('_source', 'stats')
+ stats.is_a?(Hash) ? stats : {}
+end
+
+def extract_error_ids_data(job_data)
+ error_ids = job_data['error_ids'] || job_data.dig('_source', 'error_ids')
+ error_ids.is_a?(Array) ? error_ids : []
+end
+
+def build_diagnosis_result(job_data, job_state, error_type, error_info)
+ {
+ 'status' => 'analyzed',
+ 'error_type' => error_type,
+ 'original_job_state' => job_state,
+ 'title' => error_info[:title],
+ 'category' => error_info[:category],
+ 'severity' => error_info[:severity],
+ 'possible_causes' => error_info[:causes],
+ 'suggested_solutions' => customize_solutions(error_info[:solutions], job_data),
+ 'quick_fixes' => customize_solutions(error_info[:quick_fixes], job_data),
+ 'job_id' => job_data['id'],
+ 'result_root' => job_data['result_root'],
+ 'analyzed_at' => Time.now.to_s,
+ 'confidence_score' => calculate_confidence_score(job_state, job_data)
+ }
+end
+
+def detect_error_type(job_state, stats, error_ids, job_data)
+ if error_ids && !error_ids.empty?
+ precise_type = analyze_error_ids_patterns(error_ids)
+ return precise_type if precise_type != 'unknown'
+ end
+
+ if stats && !stats.empty?
+ stats_type = analyze_stats_patterns(stats)
+ return stats_type if stats_type != 'unknown'
+ end
+
+ return analyze_job_state_patterns(job_state, job_data)
+end
+
+def analyze_error_ids_patterns(error_ids)
+ return 'unknown' unless error_ids.is_a?(Array)
+
+ # Build dependency errors
+ if error_ids.any? { |id| id.match?(/dependency|missing|require/) }
+ return 'build_dependency_error'
+ end
+
+ # Compile errors
+ if error_ids.any? { |id| id.match?(/compile|gcc|build-pkg\..*error/) }
+ return 'build_compile_error'
+ end
+
+ # Spec file errors
+ if error_ids.any? { |id| id.match?(/spec|pkgbuild/) }
+ return 'build_spec_error'
+ end
+
+ # Test execution errors
+ if error_ids.any? { |id| id.match?(/test|check|verify/) }
+ return 'test_execution_error'
+ end
+
+ # Has error_ids but cannot identify specific type
+ return 'build_unknown_error' if !error_ids.empty?
+
+ 'unknown'
+end
+
+def analyze_stats_patterns(stats)
+ return 'unknown' unless stats.is_a?(Hash)
+
+ # stderr.exit_fail is the most important failure indicator
+ if stats['stderr.exit_fail'] && stats['stderr.exit_fail'] > 0
+ # Further analyze specific types of stderr
+ stderr_keys = stats.keys.select { |k| k.start_with?('stderr.') }
+
+ return 'build_timeout_error' if stderr_keys.any? { |k| k.match?(/timeout|time/) }
+ return 'build_memory_error' if stderr_keys.any? { |k| k.match?(/memory|oom/) }
+ return 'build_permission_error' if stderr_keys.any? { |k| k.match?(/permission|access/) }
+ return 'build_exit_error' # Generic exit error
+ end
+
+ # Timeout related statistics
+ if stats['timeout'] && stats['timeout'] > 0
+ return 'build_timeout_error'
+ end
+
+ # Installation related statistics
+ install_keys = stats.keys.select { |k| k.match?(/install.*fail/) }
+ if !install_keys.empty?
+ return 'build_dependency_error'
+ end
+
+ 'unknown'
+end
+
+def analyze_job_state_patterns(job_state, job_data)
+ # 1. Scheduler phase errors
+ return 'scheduler_error' if job_state == 'submit'
+
+ # 2. Queue timeout errors
+ return 'queue_timeout' if job_state == 'queued' && check_if_stuck?(job_data, 'queued')
+
+ # 3. Testbox related errors
+ return 'testbox_error' if %w[boot incomplete timeout].include?(job_state)
+
+ # 4. Running phase stuck
+ return 'execution_stuck' if job_state == 'running' && check_if_stuck?(job_data, 'running')
+
+ # 5. Post-processing phase issues
+ return 'post_processing_error' if job_state == 'post_run' && check_if_stuck?(job_data, 'post_run')
+
+ # 6. Canceled state analysis
+ return 'job_canceled' if job_state == 'canceled'
+
+ # 7. Build failure state - precise analysis based on real stats
+ if job_state == 'failed'
+ return analyze_failed_job_details(job_data)
+ end
+
+ # 8. Default return original state
+ job_state
+end
+
+def analyze_failed_job_details(job_data)
+ stats = extract_stats_data(job_data)
+ return 'failed' unless stats.is_a?(Hash)
+
+ # Based on real Compass CI health status calculation logic
+
+ # 1. Check if build started successfully (based on rpmbuild.start_time.message)
+ build_started = stats.key?('rpmbuild.start_time.message')
+
+ # 2. Check installation phase failures (based on install-rpm.*.fail pattern)
+ install_failed = stats.keys.any? { |k| k.match?(/install-rpm\..*\.fail$/) }
+ service_failed = stats.keys.any? { |k| k.match?(/install-rpm\..*_service_.*\.fail$/) }
+
+ # 3. Check functional test status (based on rpmbuild.func.message)
+ func_success = stats.key?('rpmbuild.func.message')
+
+ # 4. Return error type by priority
+ return 'build_dependency_error' if install_failed
+ return 'service_start_error' if service_failed
+ return 'test_execution_error' if build_started && !func_success
+ return 'build_compile_error' unless build_started
+
+ # Default build failure
+ 'failed'
+end
+
+def customize_solutions(solutions, job_data)
+ return [] unless solutions.is_a?(Array)
+
+ # Build complete template variable mapping
+ template_vars = build_template_variables(job_data)
+
+ solutions.map do |solution|
+ customize_single_solution(solution, template_vars)
+ end
+end
+
+def build_template_variables(job_data)
+ {
+ '${result_root}' => job_data['result_root'] || '/result/path/not/available',
+ '${testbox}' => job_data['testbox'] || 'unknown-testbox',
+ '${suite}' => job_data['suite'] || 'unknown-suite',
+ '${os}' => job_data['os'] || 'unknown-os',
+ '${os_version}' => job_data['os_version'] || 'unknown-version',
+ '${os_arch}' => job_data['os_arch'] || 'unknown-arch',
+ '${job_id}' => job_data['id'] || 'unknown-job-id',
+    '<service_name>' => extract_service_name(job_data),
+    '<package_name>' => extract_dependency_name(job_data)
+ }
+end
+
+def customize_single_solution(solution, template_vars)
+ customized = solution.dup
+ template_vars.each do |placeholder, value|
+ customized = customized.gsub(placeholder, value.to_s)
+ end
+ customized
+end
+
+def extract_service_name(job_data)
+ stats = extract_stats_data(job_data)
+ service_key = stats.keys.find { |k| k.match?(/install-rpm.*service/) }
+ return 'unknown-service' unless service_key
+
+ # Extract service name from key: install-rpm.httpd_service_start.fail -> httpd
+ service_key.match(/install-rpm\.(.*)_service/)[1] rescue 'unknown-service'
+end
+
+def extract_dependency_name(job_data)
+ stats = extract_stats_data(job_data)
+ install_key = stats.keys.find { |k| k.match?(/install-rpm.*install.*fail/) }
+ return 'unknown-package' unless install_key
+
+ # Extract package name from key
+ install_key.match(/install-rpm\.(.*)_install/)[1] rescue 'unknown-package'
+end
+
+def calculate_confidence_score(job_state, job_data)
+ score = 0.3
+ score += 0.3 if job_data['build_job_health']
+ score += 0.2 if job_data['install_job_health']
+ score += 0.15 if %w[failed incomplete].include?(job_state)
+ [score, 1.0].min
+end
+
+def default_error_info
+ {
+ title: 'Unknown Error',
+ category: 'unknown',
+ severity: 'medium',
+ causes: ['Unknown cause'],
+ solutions: ['Please contact technical support'],
+ quick_fixes: ['Please contact technical support']
+ }
+end
+
+def log_diagnosis_error(error, context)
+ return unless defined?(log_error)
+
+ log_error({
+ 'component' => 'job_diagnosis',
+ 'context' => context,
+ 'message' => error.message,
+ 'error_type' => 'diagnosis_error'
+ })
+end
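
Usage sketch (illustrative; the job hash below is made up): diagnose_job leaves successful jobs untouched and merges a 'diagnosis' hash into failed ones, so a caller only needs to check for that key:

    ENV['COMPASS_DIAGNOSIS_ENABLED'] = 'true'
    require_relative 'lib/job_diagnosis'

    job = {
      'id' => 'example.1',                          # hypothetical job
      'job_state' => 'failed',
      'result_root' => '/result/example/1',
      'stats' => { 'stderr.exit_fail' => 1 }
    }

    diagnosed = diagnose_job(job)
    puts diagnosed.dig('diagnosis', 'error_type')   # => "build_exit_error" for this stats pattern
    puts diagnosed.dig('diagnosis', 'suggested_solutions')
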
diff --git a/lib/job_diagnosis_api.rb b/lib/job_diagnosis_api.rb
new file mode 100644
index 000000000..0456315a0
--- /dev/null
+++ b/lib/job_diagnosis_api.rb
@@ -0,0 +1,502 @@
+# SPDX-License-Identifier: MulanPSL-2.0+
+# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved.
+# frozen_string_literal: true
+
+# Compass CI diagnosis API interface module
+
+require_relative 'job_diagnosis'
+
+def standard_headers
+ { 'Content-Type' => 'application/json', 'Access-Control-Allow-Origin' => '*' }
+end
+
+def get_diagnosis_status
+ return [503, standard_headers, { 'error' => 'Diagnosis disabled' }.to_json] unless diagnosis_enabled?
+
+ status_info = {
+ 'enabled' => true,
+ 'version' => '1.0.0',
+ 'status' => 'operational',
+ 'supported_errors' => DIAGNOSIS_LIBRARY.keys
+ }
+
+ [200, standard_headers, status_info.to_json]
+rescue StandardError => e
+ log_diagnosis_error(e, 'get_diagnosis_status') if defined?(log_error)
+ [500, standard_headers, { 'error' => 'Internal server error' }.to_json]
+end
+
+def diagnose_job_by_id(job_id)
+ return { 'error' => 'Diagnosis disabled' } unless diagnosis_enabled?
+ return { 'error' => 'Invalid job_id' } unless job_id.is_a?(String) && !job_id.strip.empty?
+
+ begin
+ # Get job data from data source (implement based on actual data access method)
+ job_data = fetch_job_data(job_id.strip)
+
+ unless job_data && job_data.is_a?(Hash)
+ # Provide fallback diagnosis service
+ return perform_fallback_diagnosis(job_id)
+ end
+
+    # Execute diagnosis on the job source document (ES hits wrap it under '_source')
+    job_source = job_data['_source'] || job_data
+    diagnosis = diagnose_job(job_source)['diagnosis'] || {}
+
+    # Format return data
+    {
+      'job_id' => job_id,
+      'job_state' => job_source['job_state'] || 'unknown',
+      'needs_diagnosis' => !diagnosis.empty?,
+      'error_type' => diagnosis['error_type'],
+      'confidence' => diagnosis['confidence_score'] || 0,
+      'description' => diagnosis['title'],
+      'causes' => diagnosis['possible_causes'] || [],
+      'solutions' => diagnosis['suggested_solutions'] || [],
+      'quick_fixes' => diagnosis['quick_fixes'] || [],
+      'submit_time' => job_source['submit_time'],
+      'suite' => job_source['suite'],
+      'os' => job_source['os'],
+      'testbox' => job_source['testbox']
+    }
+ rescue StandardError => e
+ log_diagnosis_error(e, 'diagnose_job_by_id') if defined?(log_error)
+ {
+ 'job_id' => job_id,
+ 'error' => "Diagnosis service temporarily unavailable: #{e.message}",
+ 'fallback_available' => true,
+ 'suggestion' => 'Data source temporarily unavailable. Please try again later or contact system administrator.',
+ 'needs_diagnosis' => false
+ }
+ end
+end
+
+def perform_fallback_diagnosis(job_id)
+ {
+ 'job_id' => job_id,
+ 'diagnosis_mode' => 'fallback',
+ 'data_source_status' => 'unavailable',
+ 'message' => 'Limited diagnosis available - primary data source temporarily unavailable',
+ 'basic_analysis' => analyze_job_id_pattern(job_id),
+ 'general_suggestions' => [
+ "Verify job ID format: #{job_id}",
+ "Check if the job was submitted correctly",
+ "Wait for data source to become available for detailed diagnosis",
+ "Check system health at /diagnosis/health"
+ ],
+ 'system_status' => {
+ 'elasticsearch' => 'unavailable',
+ 'fallback_service' => 'active'
+ },
+ 'full_diagnosis_available' => false,
+ 'retry_suggestion' => 'Please try again in a few minutes for complete diagnosis'
+ }
+end
+
+def analyze_job_id_pattern(job_id)
+ return 'Invalid job ID format' unless job_id.is_a?(String) && !job_id.strip.empty?
+
+ # Basic format check
+ if job_id.match?(/^\d+$/)
+ "Numeric job ID detected: #{job_id} - appears to be a valid format"
+ elsif job_id.match?(/^[a-zA-Z0-9_-]+$/)
+ "Alphanumeric job ID detected: #{job_id} - appears to be a valid format"
+ else
+ "Unusual job ID format detected: #{job_id} - please verify the ID is correct"
+ end
+end
+
+def fetch_job_data(job_id)
+ begin
+ # Use existing ES_CLIENT for query
+ if defined?(ES_CLIENT)
+ query = {
+ query: {
+ bool: {
+ must: [{ term: { id: job_id } }]
+ }
+ },
+ size: 1
+ }
+
+ result = ES_CLIENT.search(index: 'jobs*', body: query)
+ hits = result['hits']['hits']
+
+ return hits.first if hits && !hits.empty?
+ end
+
+ # Return nil if ES_CLIENT is unavailable or query fails
+ nil
+ rescue StandardError => e
+ warn "[COMPASS-CI] Failed to fetch job data for #{job_id}: #{e.message}" if ENV['DEBUG']
+ nil
+ end
+end
+
+def get_diagnosis_error_types
+ return [503, standard_headers, { 'error' => 'Diagnosis disabled' }.to_json] unless diagnosis_enabled?
+
+ error_types = DIAGNOSIS_LIBRARY.keys.map do |type|
+ info = DIAGNOSIS_LIBRARY[type]
+ {
+ 'type' => type,
+ 'title' => info[:title],
+ 'category' => info[:category],
+ 'severity' => info[:severity]
+ }
+ end
+
+ response_data = {
+ 'error_types' => error_types,
+ 'total' => error_types.length
+ }
+
+ [200, standard_headers, response_data.to_json]
+rescue StandardError => e
+ log_diagnosis_error(e, 'get_diagnosis_error_types') if defined?(log_error)
+ [500, standard_headers, { 'error' => 'Internal server error' }.to_json]
+end
+
+# Get error guidance information
+def get_diagnosis_error_guidance(error_type)
+ return [503, standard_headers, { 'error' => 'Diagnosis disabled' }.to_json] unless diagnosis_enabled?
+ return [400, standard_headers, { 'error' => 'Missing error_type' }.to_json] unless error_type
+ return [404, standard_headers, { 'error' => 'Error type not found' }.to_json] unless DIAGNOSIS_LIBRARY.key?(error_type)
+
+ guidance = DIAGNOSIS_LIBRARY[error_type]
+ [200, standard_headers, guidance.to_json]
+rescue StandardError => e
+ log_diagnosis_error(e, 'get_diagnosis_error_guidance') if defined?(log_error)
+ [500, standard_headers, { 'error' => 'Internal server error' }.to_json]
+end
+
+# Start interactive troubleshooting wizard
+def start_diagnosis_wizard(error_type, difficulty_level = 'intermediate')
+ return [400, standard_headers, { 'error' => 'Invalid error_type' }.to_json] unless DIAGNOSIS_LIBRARY.key?(error_type)
+
+ error_info = DIAGNOSIS_LIBRARY[error_type]
+ wizard_steps = generate_wizard_steps(error_info, difficulty_level)
+
+ wizard_session = {
+ 'wizard_id' => generate_wizard_id,
+ 'error_type' => error_type,
+ 'difficulty_level' => difficulty_level,
+ 'total_steps' => wizard_steps.length,
+ 'current_step' => 1,
+ 'steps' => wizard_steps,
+ 'started_at' => Time.now.to_s
+ }
+
+ [200, standard_headers, wizard_session.to_json]
+rescue StandardError => e
+ log_diagnosis_error(e, 'start_diagnosis_wizard') if defined?(log_error)
+ [500, standard_headers, { 'error' => 'Failed to start wizard' }.to_json]
+end
+
+# Generate wizard steps
+def generate_wizard_steps(error_info, difficulty_level)
+ base_steps = [
+ {
+ 'step' => 1,
+ 'title' => 'Problem Confirmation',
+ 'description' => "Confirm you are experiencing: #{error_info[:title]}",
+ 'action' => 'Please confirm if the error type is correct',
+ 'expected_response' => 'yes/no'
+ },
+ {
+ 'step' => 2,
+ 'title' => 'Environment Check',
+ 'description' => 'Check build environment status',
+ 'action' => 'Please check system resources and network connection',
+ 'expected_response' => 'status'
+ }
+ ]
+
+ # Add steps based on difficulty level
+ case difficulty_level
+ when 'beginner'
+ base_steps += [
+ {
+ 'step' => 3,
+ 'title' => 'Basic Check',
+ 'description' => 'Execute basic environment checks',
+ 'action' => 'Please execute the following commands one by one',
+ 'commands' => error_info[:quick_fixes],
+ 'expected_response' => 'command_results'
+ }
+ ]
+ when 'expert'
+ base_steps += [
+ {
+ 'step' => 3,
+ 'title' => 'Advanced Diagnosis',
+ 'description' => 'Execute deep system analysis',
+ 'action' => 'Please conduct comprehensive environment and log analysis',
+ 'expected_response' => 'detailed_analysis'
+ }
+ ]
+ else
+ base_steps += [
+ {
+ 'step' => 3,
+ 'title' => 'Detailed Analysis',
+ 'description' => 'Analyze specific error causes',
+ 'possible_causes' => error_info[:causes],
+ 'expected_response' => 'analysis'
+ },
+ {
+ 'step' => 4,
+ 'title' => 'Solution',
+ 'description' => 'Execute recommended solutions',
+ 'solutions' => error_info[:solutions],
+ 'expected_response' => 'solution_result'
+ }
+ ]
+ end
+
+ base_steps
+end
+
+# Generate wizard ID
+def generate_wizard_id
+ "wizard_#{Time.now.to_i}_#{rand(1000)}"
+end
+
+# Process wizard steps - stateless handling with wizard_id format validation
+def process_wizard_step(wizard_id, step_number, user_response = nil)
+ return [400, standard_headers, { 'error' => 'Missing wizard_id' }.to_json] unless wizard_id
+ return [400, standard_headers, { 'error' => 'Missing step_number' }.to_json] unless step_number
+
+ # Validate wizard_id format
+ unless wizard_id.match?(/^wizard_\d+_\d+$/)
+ return [400, standard_headers, { 'error' => 'Invalid wizard_id format' }.to_json]
+ end
+
+ # Generate step guidance
+ step_guidance = generate_step_guidance(step_number.to_i, user_response)
+
+ # Add session information
+ step_guidance['wizard_id'] = wizard_id
+ step_guidance['session_valid'] = true
+ step_guidance['timestamp'] = Time.now.to_s
+
+ [200, standard_headers, step_guidance.to_json]
+rescue StandardError => e
+ log_diagnosis_error(e, 'process_wizard_step') if defined?(log_error)
+ [500, standard_headers, { 'error' => 'Failed to process wizard step' }.to_json]
+end
+
+# Generate step guidance
+def generate_step_guidance(step_number, user_response)
+ base_guidance = {
+ 'step_number' => step_number,
+ 'user_response' => user_response,
+ 'timestamp' => Time.now.to_s
+ }
+
+ case step_number
+ when 1
+ base_guidance.merge({
+ 'guidance' => 'Problem confirmation steps',
+ 'next_action' => user_response == 'yes' ? 'Continue to environment check' : 'Reselect error type',
+ 'instructions' => user_response == 'yes' ?
+ ['Problem confirmed successfully, ready for environment check'] :
+ ['Please reconfirm the error type you encountered']
+ })
+ when 2
+ base_guidance.merge({
+ 'guidance' => 'Environment check steps',
+ 'next_action' => 'Continue diagnosis based on environment status',
+ 'instructions' => [
+ 'Check system resource usage',
+ 'Verify network connection status',
+ 'Confirm necessary service running status'
+ ]
+ })
+ when 3
+ base_guidance.merge({
+ 'guidance' => 'Detailed analysis steps',
+ 'next_action' => 'Execute specific solutions',
+ 'instructions' => [
+ 'Analyze error log content',
+ 'Check configuration file correctness',
+ 'Verify dependency relationship integrity'
+ ]
+ })
+ when 4
+ base_guidance.merge({
+ 'guidance' => 'Solution execution steps',
+ 'next_action' => 'Verify if problem is resolved',
+ 'instructions' => [
+ 'Execute recommended solutions step by step',
+ 'Record execution process and results',
+ 'Verify if problem is resolved'
+ ]
+ })
+ else
+ base_guidance.merge({
+ 'guidance' => 'Diagnosis completed',
+ 'next_action' => 'Problem resolved or seek further help',
+ 'instructions' => ['Diagnosis process completed, please check if problem is resolved']
+ })
+ end
+end
+
+# Enhanced environment health check - deep integration with project core components
+def perform_environment_health_check
+ return [503, standard_headers, { 'error' => 'Diagnosis disabled' }.to_json] unless diagnosis_enabled?
+
+ health_results = {
+ 'check_time' => Time.now.to_s,
+ 'overall_status' => 'healthy',
+ 'checks' => {
+ 'diagnosis_library' => {
+ 'status' => DIAGNOSIS_LIBRARY.any? ? 'ok' : 'error',
+ 'details' => "Supports #{DIAGNOSIS_LIBRARY.keys.length} error types"
+ },
+ 'environment' => {
+ 'status' => diagnosis_enabled? ? 'ok' : 'disabled',
+ 'details' => "COMPASS_DIAGNOSIS_ENABLED = #{ENV['COMPASS_DIAGNOSIS_ENABLED']}"
+ },
+ 'scheduler' => check_scheduler_health,
+ 'testboxes' => check_testboxes_health,
+ 'elasticsearch' => check_elasticsearch_health,
+ 'git_mirror' => check_git_mirror_health
+ }
+ }
+
+ # Calculate overall status
+ failed_checks = health_results['checks'].count { |_, check| check['status'] == 'error' }
+ health_results['overall_status'] = failed_checks > 0 ? 'unhealthy' : 'healthy'
+
+ [200, standard_headers, health_results.to_json]
+rescue StandardError => e
+ log_diagnosis_error(e, 'perform_environment_health_check') if defined?(log_error)
+ [500, standard_headers, { 'error' => 'Health check failed' }.to_json]
+end
+
+# Check Scheduler health status - integrate ETCD check
+def check_scheduler_health
+ return { 'status' => 'disabled', 'details' => 'ETCD client unavailable' } unless defined?(ETCD_CLIENT)
+
+ begin
+ # Check scheduler queue status
+ queues = ETCD_CLIENT.get('/queues/sched/ready', range_end: '/queues/sched/ready/zzzzzzzzzzz').to_h
+ queue_count = queues.keys.length
+
+ {
+ 'status' => 'ok',
+ 'details' => "Scheduler queue normal, current queue count: #{queue_count}"
+ }
+ rescue StandardError => e
+ {
+ 'status' => 'error',
+ 'details' => "Scheduler queue check failed: #{e.message}"
+ }
+ end
+end
+
+# Check Testboxes health status - integrate search_testboxes
+def check_testboxes_health
+ return { 'status' => 'disabled', 'details' => 'search_testboxes function unavailable' } unless defined?(search_testboxes)
+
+ begin
+ testboxes, total = search_testboxes
+ active_count = total || 0
+
+ {
+ 'status' => active_count > 0 ? 'ok' : 'warning',
+ 'details' => "Available testbox count: #{active_count}"
+ }
+ rescue StandardError => e
+ {
+ 'status' => 'warning',
+ 'details' => "Unable to check testbox status: #{e.message}"
+ }
+ end
+end
+
+# Check Elasticsearch health status - integrate ES_CLIENT
+def check_elasticsearch_health
+ return { 'status' => 'disabled', 'details' => 'ES client unavailable' } unless defined?(ES_CLIENT)
+
+ begin
+ ES_CLIENT.ping
+ {
+ 'status' => 'ok',
+ 'details' => 'Elasticsearch connection normal'
+ }
+ rescue StandardError => e
+ {
+ 'status' => 'error',
+ 'details' => "Elasticsearch connection failed: #{e.message}"
+ }
+ end
+end
+
+# Check Git mirror health status - integrate existing health check
+def check_git_mirror_health
+ return { 'status' => 'disabled', 'details' => 'git_mirror_health function unavailable' } unless defined?(git_mirror_health)
+
+ begin
+ result = git_mirror_health
+ status_code = result.is_a?(Array) ? result[0] : 500
+
+ {
+ 'status' => status_code == 200 ? 'ok' : 'warning',
+ 'details' => 'Code mirror health status check completed'
+ }
+ rescue StandardError => e
+ {
+ 'status' => 'warning',
+ 'details' => "Unable to check code mirror status: #{e.message}"
+ }
+ end
+end
+
+# Generate simplified diagnosis report
+def generate_diagnosis_report(job_data, diagnosis_data)
+ return nil unless job_data.is_a?(Hash) && diagnosis_data.is_a?(Hash)
+
+ report = {
+ 'report_id' => "report_#{job_data['id']}_#{Time.now.to_i}",
+ 'generated_at' => Time.now.to_s,
+ 'job_id' => job_data['id'],
+ 'job_state' => job_data['job_state'],
+ 'error_type' => diagnosis_data['error_type'],
+ 'title' => diagnosis_data['title'],
+ 'severity' => diagnosis_data['severity'],
+ 'possible_causes' => diagnosis_data['possible_causes'],
+ 'suggested_solutions' => diagnosis_data['suggested_solutions'],
+ 'quick_fixes' => diagnosis_data['quick_fixes'],
+ 'result_root' => job_data['result_root']
+ }
+
+ format_simple_report(report)
+rescue StandardError => e
+ log_diagnosis_error(e, 'generate_diagnosis_report') if defined?(log_error)
+ "Report generation failed: #{e.message}"
+end
+
+# Format simplified report
+def format_simple_report(report)
+ text = []
+ text << "Compass CI Diagnosis Report"
+ text << "=" * 50
+ text << "Job ID: #{report['job_id']}"
+ text << "Status: #{report['job_state']}"
+ text << "Error Type: #{report['title']}"
+ text << "Severity: #{report['severity']}"
+ text << ""
+ text << "Possible Causes:"
+ report['possible_causes'].each { |c| text << "- #{c}" }
+ text << ""
+ text << "Suggested Solutions:"
+ report['suggested_solutions'].each_with_index { |s, i| text << "#{i+1}. #{s}" }
+ text << ""
+ text << "Quick Fixes:"
+ report['quick_fixes'].each { |f| text << "- #{f}" }
+ text << ""
+ text << "Result Directory: #{report['result_root']}" if report['result_root']
+
+ text.join("\n")
+end
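
Usage sketch (illustrative data; the stats key is hypothetical): generate_diagnosis_report only formats what diagnose_job already produced, so the two modules can be combined like this:

    ENV['COMPASS_DIAGNOSIS_ENABLED'] ||= 'true'
    require_relative 'lib/job_diagnosis'
    require_relative 'lib/job_diagnosis_api'

    job = { 'id' => 'example.2', 'job_state' => 'failed',
            'stats' => { 'install-rpm.foo_install.fail' => 1 } }
    diagnosed = diagnose_job(job)

    report_text = generate_diagnosis_report(diagnosed, diagnosed['diagnosis'])
    puts report_text    # plain-text report produced by format_simple_report
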
diff --git a/lib/job_diagnosis_library.rb b/lib/job_diagnosis_library.rb
new file mode 100644
index 000000000..63a00f646
--- /dev/null
+++ b/lib/job_diagnosis_library.rb
@@ -0,0 +1,378 @@
+# SPDX-License-Identifier: MulanPSL-2.0+
+# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved.
+# frozen_string_literal: true
+
+# Compass CI diagnosis knowledge base - error types and solutions
+
+DIAGNOSIS_LIBRARY = {
+ 'failed' => {
+ title: 'Build Failed',
+ category: 'build_failure',
+ severity: 'high',
+ causes: [
+ 'Missing build dependencies - check BuildRequires',
+ 'Source compilation error - check build log',
+ 'Configuration file error - verify spec file'
+ ],
+ solutions: [
+ '1. Check build log file under result_root',
+ '2. Verify all BuildRequires dependencies',
+ '3. Check spec file syntax and configuration'
+ ],
+ quick_fixes: [
+ 'Check ${result_root}/build.log',
+ 'Run rpm -qa | grep -i dependency_name',
+ 'Verify spec file format'
+ ]
+ },
+ 'scheduler_error' => {
+ title: 'Scheduler Error',
+ category: 'scheduler_failure',
+ severity: 'high',
+ causes: [
+ 'Scheduler service exception - check scheduler status',
+ 'Job queue overloaded - check queue status',
+ 'Job format error - verify job.yaml'
+ ],
+ solutions: [
+ '1. Check scheduler service status and logs',
+ '2. Verify job submission format and parameters',
+ '3. Check target testbox availability'
+ ],
+ quick_fixes: [
+ 'Check scheduler service status',
+ 'Verify testbox availability',
+ 'Resubmit job'
+ ]
+ },
+ 'testbox_error' => {
+ title: 'Testbox Execution Error',
+ category: 'testbox_failure',
+ severity: 'high',
+ causes: [
+ 'Testbox startup timeout - check boot_time',
+ 'Testbox resource shortage - check CPU/memory',
+ 'Testbox network issues - check connection status'
+ ],
+ solutions: [
+ '1. Check testbox startup time and thresholds',
+ '2. Verify testbox resource configuration',
+ '3. Check network connection and download speed'
+ ],
+ quick_fixes: [
+ 'Check testbox status: ${testbox}',
+ 'Reassign to another testbox',
+ 'Increase resource configuration'
+ ]
+ },
+ 'incomplete' => {
+ title: 'Job Incomplete',
+ category: 'job_incomplete',
+ severity: 'medium',
+ causes: [
+ 'System startup timeout - check boot_time',
+ 'Memory shortage - check system resources',
+ 'Network issues - check connection status'
+ ],
+ solutions: [
+ '1. Check if testbox boot_time exceeds threshold',
+ '2. Verify system resource configuration (CPU/memory)',
+ '3. Check network connection and download status'
+ ],
+ quick_fixes: [
+ 'Check testbox status',
+ 'Resubmit job to another testbox',
+ 'Increase resource configuration'
+ ]
+ },
+ 'build_dependency_error' => {
+ title: 'RPM Build Dependency Error',
+ category: 'dependency_issue',
+ severity: 'high',
+ causes: [
+ 'RPM package dependencies missing',
+ 'Repository configuration error',
+ 'Version conflict issues'
+ ],
+ solutions: [
+ '1. Check ${result_root}/install.log',
+ '2. Verify repository configuration and availability',
+ '3. Resolve dependency package version conflicts'
+ ],
+ quick_fixes: [
+ 'yum clean all && yum makecache',
+ 'Check repository configuration files',
+ 'Manually install missing dependencies'
+ ]
+ },
+ 'test_execution_error' => {
+ title: 'Test Execution Error',
+ category: 'test_failure',
+ severity: 'medium',
+ causes: [
+ 'Service startup failure',
+ 'Command execution error',
+ 'Permission issues'
+ ],
+ solutions: [
+ '1. Check service configuration and dependencies',
+ '2. Verify execution permissions',
+ '3. Analyze specific error information'
+ ],
+ quick_fixes: [
+      'systemctl status <service_name>',
+ 'Check file permissions',
+ 'View detailed error logs'
+ ]
+ },
+ 'queue_timeout' => {
+ title: 'Queue Timeout',
+ category: 'scheduler_issue',
+ severity: 'medium',
+ causes: [
+ 'System overload - insufficient available testboxes',
+ 'Priority setting issues - job priority too low',
+ 'Scheduler scheduling strategy issues'
+ ],
+ solutions: [
+ '1. Check overall system load and queue status',
+ '2. Adjust job priority or resource requirements',
+ '3. Contact administrator to check scheduler configuration',
+ '4. Consider resubmitting during off-peak hours'
+ ],
+ quick_fixes: [
+ 'Check available testbox count',
+ 'Reduce resource requirements and resubmit',
+ 'Adjust job priority'
+ ]
+ },
+ 'execution_stuck' => {
+ title: 'Execution Stuck',
+ category: 'execution_issue',
+ severity: 'high',
+ causes: [
+ 'Test case infinite loop or endless waiting',
+ 'System resource shortage causing deadlock',
+ 'Network connection interruption',
+ 'Testbox hardware failure'
+ ],
+ solutions: [
+ '1. Check if test cases have infinite loops',
+ '2. Monitor testbox resource usage',
+ '3. Verify network connection stability',
+ '4. Consider terminating and resubmitting job'
+ ],
+ quick_fixes: [
+ 'Terminate current job',
+ 'Check ${testbox} resource usage',
+ 'Resubmit to another testbox'
+ ]
+ },
+ 'post_processing_error' => {
+ title: 'Post-processing Error',
+ category: 'post_processing_issue',
+ severity: 'medium',
+ causes: [
+ 'Result file upload failed',
+ 'Statistical data processing error',
+ 'Insufficient storage space',
+ 'Network transmission interrupted'
+ ],
+ solutions: [
+ '1. Check result storage directory status',
+ '2. Verify network connection and transmission status',
+ '3. Check if storage space is sufficient',
+ '4. Retrigger post-processing workflow'
+ ],
+ quick_fixes: [
+ 'Check storage space: df -h',
+ 'Verify network connection',
+ 'Check ${result_root} permissions'
+ ]
+ },
+ 'job_canceled' => {
+ title: 'Job Canceled',
+ category: 'job_management',
+ severity: 'low',
+ causes: [
+ 'User actively canceled job',
+ 'System auto-canceled (insufficient resources)',
+ 'Exceeded maximum runtime limit',
+ 'Scheduler policy auto-canceled'
+ ],
+ solutions: [
+ '1. Confirm cancellation reason and source',
+ '2. Check job configuration and resource requirements',
+ '3. Adjust parameters and resubmit',
+ '4. Contact administrator about cancellation policy'
+ ],
+ quick_fixes: [
+ 'Check cancellation reason',
+ 'Adjust resource requirements',
+ 'Resubmit job'
+ ]
+ },
+ 'build_compile_error' => {
+ title: 'Compile Error',
+ category: 'build_failure',
+ severity: 'high',
+ causes: [
+ 'Source code syntax error or compiler version incompatible',
+ 'Missing compile-time dependency header files',
+ 'Compiler parameter configuration error'
+ ],
+ solutions: [
+ '1. Check compilation error information in ${result_root}/build.log',
+ '2. Verify compiler version and source code compatibility',
+ '3. Check development package dependencies in BuildRequires'
+ ],
+ quick_fixes: [
+ 'View compilation error details: cat ${result_root}/build.log | grep -A5 -B5 error',
+ 'Check compiler version: gcc --version',
+ 'Verify development packages: rpm -qa | grep -E "devel|dev"'
+ ]
+ },
+ 'build_spec_error' => {
+ title: 'Spec File Error',
+ category: 'build_failure',
+ severity: 'high',
+ causes: [
+ 'Spec file syntax error',
+ 'Macro definition error or missing',
+ 'File path or permission configuration error'
+ ],
+ solutions: [
+ '1. Check spec file syntax correctness',
+ '2. Verify all macro definitions and variables',
+ '3. Confirm file list and permission settings'
+ ],
+ quick_fixes: [
+ 'Syntax check: rpmlint ${suite}.spec',
+ 'Macro expansion check: rpm --eval "%{_builddir}"',
+ 'File list verification: rpm -qpl ${result_root}/*.rpm'
+ ]
+ },
+ 'build_exit_error' => {
+ title: 'Build Process Abnormal Exit',
+ category: 'build_failure',
+ severity: 'high',
+ causes: [
+ 'Build script execution failed',
+ 'Process terminated due to insufficient system resources',
+ 'Fatal error encountered during build process'
+ ],
+ solutions: [
+ '1. Check specific error information in stderr',
+ '2. Verify system resource usage',
+ '3. Check build script execution logic'
+ ],
+ quick_fixes: [
+ 'View error output: cat ${result_root}/stderr',
+ 'Check system resources: free -m && df -h',
+ 'Verify build script: bash -x ${result_root}/job.sh'
+ ]
+ },
+ 'build_timeout_error' => {
+ title: 'Build Timeout',
+ category: 'build_failure',
+ severity: 'medium',
+ causes: [
+ 'Build process takes too long and exceeds limit',
+ 'Build process stuck and unresponsive',
+ 'High system load affecting build speed'
+ ],
+ solutions: [
+ '1. Check build logs to confirm stuck location',
+ '2. Optimize build scripts to improve efficiency',
+ '3. Appropriately increase timeout limit'
+ ],
+ quick_fixes: [
+ 'Check last activity: tail ${result_root}/build.log',
+ 'View system load: uptime',
+ 'Check timeout configuration: grep -i timeout ${result_root}/job.yaml'
+ ]
+ },
+ 'build_memory_error' => {
+    title: 'Insufficient Memory Error',
+ category: 'build_failure',
+ severity: 'high',
+ causes: [
+ 'Build process consumes too much memory',
+ 'Insufficient system available memory',
+ 'Memory leak causing OOM'
+ ],
+ solutions: [
+ '1. Check system memory usage',
+ '2. Optimize build process to reduce memory consumption',
+ '3. Consider using testbox with larger memory'
+ ],
+ quick_fixes: [
+ 'Check memory usage: free -m',
+ 'View OOM logs: dmesg | grep -i "killed process"',
+ 'Check swap usage: swapon -s'
+ ]
+ },
+ 'build_permission_error' => {
+ title: 'Permission Error',
+ category: 'build_failure',
+ severity: 'medium',
+ causes: [
+ 'Insufficient file or directory permissions',
+ 'User permission configuration error',
+ 'SELinux or security policy restrictions'
+ ],
+ solutions: [
+ '1. Check related file and directory permissions',
+ '2. Verify user permission configuration',
+ '3. Check SELinux policy settings'
+ ],
+ quick_fixes: [
+ 'Check permissions: ls -la ${result_root}/',
+ 'View current user: whoami && groups',
+ 'Check SELinux: getenforce && sestatus'
+ ]
+ },
+ 'build_unknown_error' => {
+ title: 'Unknown Build Error',
+ category: 'build_failure',
+ severity: 'medium',
+ causes: [
+ 'Specific error cause requires further analysis',
+ 'Possibly a new type of build issue',
+ 'Error information insufficient or incomplete'
+ ],
+ solutions: [
+ '1. Thoroughly check all build logs',
+ '2. Compare differences with similar successful builds',
+ '3. Contact technical support for in-depth analysis'
+ ],
+ quick_fixes: [
+ 'Comprehensive log check: find ${result_root} -name "*.log" -exec cat {} \\;',
+ 'Error information search: grep -r -i error ${result_root}/',
+ 'Submit detailed report: include job_id and complete logs'
+ ]
+ },
+ 'service_start_error' => {
+ title: 'Service Start Failure',
+ category: 'service_failure',
+ severity: 'high',
+ causes: [
+ 'Service configuration file error or missing',
+ 'Port conflict or insufficient permissions',
+ 'Dependent services not started',
+ 'Insufficient system resources'
+ ],
+ solutions: [
+ '1. Check service configuration file correctness',
+ '2. Verify port usage and permission settings',
+ '3. Confirm dependent service status',
+ '4. Check system resource usage'
+ ],
+ quick_fixes: [
+ 'Check service status: systemctl status ${service_name}',
+ 'View service logs: journalctl -u ${service_name} --no-pager',
+ 'Check port usage: netstat -tlnp | grep ${port}',
+ 'Verify configuration file: ${service_name} -t'
+ ]
+ }
+}.freeze
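+
+# Usage sketch (illustrative; the constant name ERROR_LIBRARY is hypothetical and
+# stands for whatever name the frozen hash above is assigned to). Each entry maps
+# an error-type key to title, category, severity, causes, solutions and quick_fixes:
+#   entry = ERROR_LIBRARY['build_timeout_error']
+#   puts entry[:title]
+#   entry[:solutions].each { |step| puts step }
+#   entry[:quick_fixes].each { |cmd| puts "  #{cmd}" }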
diff --git a/lib/job_diagnosis_monitoring.rb b/lib/job_diagnosis_monitoring.rb
new file mode 100644
index 000000000..860efe7be
--- /dev/null
+++ b/lib/job_diagnosis_monitoring.rb
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: MulanPSL-2.0+
+# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved.
+# frozen_string_literal: true
+
+# Compass CI diagnosis - real-time monitoring and alerting module
+# Provides proactive monitoring and alerting to help improve the build success rate
+
+require 'time' # Time.parse is used below
+require_relative 'job_diagnosis_core'
+
+# System health monitoring and alerting
+def monitor_system_health
+ return unless diagnosis_enabled?
+
+ health_metrics = collect_health_metrics
+ warnings = analyze_health_trends(health_metrics)
+
+ send_health_warnings(warnings) if warnings.any?
+
+ health_metrics
+rescue StandardError => e
+ log_diagnosis_error(e, 'monitor_system_health') if defined?(log_error)
+ {}
+end
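+
+# Usage sketch (illustrative only, not part of this module): monitor_system_health
+# can be invoked periodically, e.g. from a background thread or a cron-driven task:
+#   Thread.new do
+#     loop do
+#       monitor_system_health
+#       sleep 300 # check every 5 minutes
+#     end
+#   end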
+
+# Collect system health metrics
+def collect_health_metrics
+ metrics = {
+ 'timestamp' => Time.now.to_i,
+ 'scheduler_queue_size' => get_scheduler_queue_size,
+ 'active_testboxes' => get_active_testbox_count,
+ 'failed_jobs_rate' => calculate_failed_jobs_rate,
+ 'average_queue_time' => calculate_average_queue_time,
+ 'system_load' => get_system_load,
+ 'available_memory' => get_available_memory
+ }
+
+ metrics.compact
+end
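+
+# Example of the returned shape (illustrative values; metrics that could not be
+# collected are dropped by compact):
+#   { 'timestamp' => 1726867200,
+#     'scheduler_queue_size' => 12, 'active_testboxes' => 40,
+#     'failed_jobs_rate' => 3.5,      # percent, last hour
+#     'average_queue_time' => 8.2,    # minutes, last 100 completed jobs
+#     'system_load' => 1.7,
+#     'available_memory' => 5120 }    # MB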
+
+# Get scheduler queue size
+def get_scheduler_queue_size
+ return nil unless defined?(ETCD_CLIENT)
+
+ begin
+ queues = ETCD_CLIENT.get('/queues/sched/ready', range_end: '/queues/sched/ready/zzzzzzzzzzz').to_h
+ queues.keys.length
+ rescue StandardError
+ nil
+ end
+end
+
+# Get active testbox count
+def get_active_testbox_count
+ return nil unless defined?(search_testboxes)
+
+ begin
+ _, total = search_testboxes
+ total
+ rescue StandardError
+ nil
+ end
+end
+
+# Calculate failed job rate
+def calculate_failed_jobs_rate
+ return nil unless defined?(es_query)
+
+ begin
+ # Query jobs from the last hour
+ one_hour_ago = (Time.now - 3600).strftime('%Y-%m-%dT%H:%M:%S')
+
+ query = {
+ query: {
+ range: {
+ submit_time: {
+ gte: one_hour_ago
+ }
+ }
+ },
+ aggs: {
+ by_state: {
+ terms: {
+ field: 'job_state.keyword'
+ }
+ }
+ },
+ size: 0
+ }
+
+ result = es_query(query)
+ buckets = result.dig('aggregations', 'by_state', 'buckets') || []
+
+ total_jobs = buckets.sum { |bucket| bucket['doc_count'] }
+ failed_jobs = buckets.find { |bucket| bucket['key'] == 'failed' }&.[]('doc_count') || 0
+
+ total_jobs > 0 ? (failed_jobs.to_f / total_jobs * 100).round(2) : 0
+ rescue StandardError
+ nil
+ end
+end
+
+# Calculate average queue time
+def calculate_average_queue_time
+ return nil unless defined?(es_query)
+
+ begin
+ # Query queue time for the last 100 completed jobs
+ query = {
+ query: {
+ bool: {
+ must: [
+ { exists: { field: 'boot_time' } },
+ { exists: { field: 'submit_time' } }
+ ]
+ }
+ },
+ sort: [
+ { submit_time: { order: 'desc' } }
+ ],
+ size: 100,
+ _source: ['submit_time', 'boot_time']
+ }
+
+ result = es_query(query)
+ jobs = result.dig('hits', 'hits') || []
+
+ queue_times = jobs.filter_map do |job|
+ source = job['_source']
+ next unless source['submit_time'] && source['boot_time']
+
+ submit_time = Time.parse(source['submit_time'])
+ boot_time = Time.parse(source['boot_time'])
+ boot_time - submit_time
+ rescue StandardError
+ nil
+ end
+
+ queue_times.any? ? (queue_times.sum / queue_times.length / 60).round(2) : nil
+ rescue StandardError
+ nil
+ end
+end
+
+# Get system load - cross-platform compatible
+def get_system_load
+ if RUBY_PLATFORM.match?(/win32|mingw|mswin/)
+ # Windows: use wmic CPU load percentage as a rough load indicator
+ cpu_usage = `wmic cpu get loadpercentage /value`.match(/LoadPercentage=(\d+)/)[1].to_f
+ # Rough heuristic: scale the percentage to a load-average-like value (assumes ~4 logical cores)
+ cpu_usage / 100.0 * 4.0
+ else
+ # Linux/Unix: parse the 1-minute load average from uptime
+ `uptime`.match(/load average: ([\d\.]+)/)[1].to_f
+ end
+rescue StandardError
+ nil
+end
+
+# Get available memory - cross-platform compatible
+def get_available_memory
+ if RUBY_PLATFORM.match?(/win32|mingw|mswin/)
+ # Windows: wmic reports FreePhysicalMemory in KB; convert to MB
+ `wmic OS get FreePhysicalMemory /value`.match(/FreePhysicalMemory=(\d+)/)[1].to_i / 1024
+ else
+ # Linux/Unix: the "available" column (7th field) of the Mem: line from `free -m`, in MB
+ `free -m`.lines[1].split[6].to_i
+ end
+rescue StandardError
+ nil
+end
+
+# Analyze health trends and generate alerts
+def analyze_health_trends(metrics)
+ warnings = []
+
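+ # Threshold values below are fixed in code: queue > 100 jobs, failure rate > 20%,
+ # average queue time > 60 minutes, load > 4.0, available memory < 200 MB.
+ # Tune them to the deployment's scale as needed.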
+ # Check queue size alerts
+ if metrics['scheduler_queue_size'] && metrics['scheduler_queue_size'] > 100
+ warnings << {
+ type: 'high_queue_size',
+ severity: 'medium',
+ message: "Scheduler queue too long: #{metrics['scheduler_queue_size']} jobs waiting",
+ recommendations: [
+ 'Check available testbox count',
+ 'Consider increasing testbox resources',
+ 'Analyze queue backlog causes'
+ ]
+ }
+ end
+
+ # Check failure rate alerts
+ if metrics['failed_jobs_rate'] && metrics['failed_jobs_rate'] > 20
+ warnings << {
+ type: 'high_failure_rate',
+ severity: 'high',
+ message: "Job failure rate too high: #{metrics['failed_jobs_rate']}%",
+ recommendations: [
+ 'Check system component health status',
+ 'Analyze common causes of recent failed jobs',
+ 'Consider pausing new job submissions'
+ ]
+ }
+ end
+
+ # Check average queue time alerts
+ if metrics['average_queue_time'] && metrics['average_queue_time'] > 60
+ warnings << {
+ type: 'long_queue_time',
+ severity: 'medium',
+ message: "Average queue time too long: #{metrics['average_queue_time']} minutes",
+ recommendations: [
+ 'Increase available testbox count',
+ 'Optimize scheduling strategy',
+ 'Check resource allocation configuration'
+ ]
+ }
+ end
+
+ # Check system load alerts
+ if metrics['system_load'] && metrics['system_load'] > 4.0
+ warnings << {
+ type: 'high_system_load',
+ severity: 'high',
+ message: "System load too high: #{metrics['system_load']}",
+ recommendations: [
+ 'Check system resource usage',
+ 'Consider limiting concurrent job count',
+ 'Optimize system performance configuration'
+ ]
+ }
+ end
+
+ # Check memory alerts
+ if metrics['available_memory'] && metrics['available_memory'] < 200
+ warnings << {
+ type: 'low_memory',
+ severity: 'high',
+ message: "Insufficient available memory: #{metrics['available_memory']} MB",
+ recommendations: [
+ 'Free up unnecessary memory usage',
+ 'Restart services with high memory consumption',
+ 'Consider upgrading system memory'
+ ]
+ }
+ end
+
+ warnings
+end
+
+# Send health alerts
+def send_health_warnings(warnings)
+ return unless warnings.any?
+
+ # Log alerts to system log
+ warnings.each do |warning|
+ log_warning = {
+ 'component' => 'system_health_monitor',
+ 'type' => warning[:type],
+ 'severity' => warning[:severity],
+ 'message' => warning[:message],
+ 'recommendations' => warning[:recommendations],
+ 'timestamp' => Time.now.to_s
+ }
+
+ if defined?(log_error)
+ log_error(log_warning)
+ else
+ warn "[HEALTH WARNING] #{warning[:message]}"
+ end
+ end
+
+ # Can be extended to send notifications via email, SMS or other channels
+ # send_email_notification(warnings) if defined?(send_email_notification)
+ # send_slack_notification(warnings) if defined?(send_slack_notification)
+end
+
+# Get monitoring status summary
+def get_monitoring_status
+ return [503, standard_headers, { 'error' => 'Monitoring disabled' }.to_json] unless diagnosis_enabled?
+
+ begin
+ current_metrics = collect_health_metrics
+ warnings = analyze_health_trends(current_metrics)
+
+ status = {
+ 'monitoring_enabled' => true,
+ 'last_check' => Time.now.to_s,
+ 'current_metrics' => current_metrics,
+ 'active_warnings' => warnings,
+ 'system_status' => warnings.any? ? 'warning' : 'healthy'
+ }
+
+ [200, standard_headers, status.to_json]
+ rescue StandardError => e
+ log_diagnosis_error(e, 'get_monitoring_status') if defined?(log_error)
+ [500, standard_headers, { 'error' => 'Monitoring check failed' }.to_json]
+ end
+end
+
+# Standard HTTP headers already defined in job_diagnosis_api.rb, no need to duplicate
diff --git a/src/lib/web_backend.rb b/src/lib/web_backend.rb
index cd28e78bd..1dcf7bb49 100644
--- a/src/lib/web_backend.rb
+++ b/src/lib/web_backend.rb
@@ -26,6 +26,20 @@ require_relative './api_input_check.rb'
require_relative '../../lib/json_logger.rb'
require_relative './jwt.rb'
+# Compass CI diagnosis feature integration: load all diagnosis modules when enabled
+if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true'
+ begin
+ require_relative '../../lib/job_diagnosis_core.rb'
+ require_relative '../../lib/job_diagnosis_api.rb'
+ require_relative '../../lib/job_diagnosis_monitoring.rb'
+ puts "[COMPASS-CI] Diagnosis modules loaded successfully" if ENV['DEBUG']
+ rescue LoadError => e
+ warn "[COMPASS-CI] Diagnosis modules not found: #{e.message}"
+ rescue StandardError => e
+ warn "[COMPASS-CI] Diagnosis load failed: #{e.message}"
+ end
+end
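+# To enable in a deployment, set before starting the backend:
+#   COMPASS_DIAGNOSIS_ENABLED=true   # turns the diagnosis feature on
+#   DEBUG=1                          # optional: prints the load confirmation above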
+
UPSTREAM_REPOS_PATH = ENV['UPSTREAM_REPOS_PATH'] || '/c/upstream-repos'
FIELDS = %w[
@@ -322,9 +336,109 @@ def get_jobs_result(result)
end
jobs << job
end
+
+ # Diagnosis feature enhancement (following project functional style)
+ jobs = diagnose_jobs(jobs) if defined?(diagnose_jobs)
jobs
end
+# Diagnosis feature API interfaces (following project functional style)
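+# Each wrapper below degrades gracefully: when the optional diagnosis modules were
+# not loaded (see the COMPASS_DIAGNOSIS_ENABLED block above), it answers with a
+# 503 JSON error instead of raising.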
+def diagnosis_status
+ return get_diagnosis_status if defined?(get_diagnosis_status)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json]
+end
+
+def diagnosis_error_types
+ return get_diagnosis_error_types if defined?(get_diagnosis_error_types)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json]
+end
+
+def diagnosis_error_guidance(params)
+ return get_diagnosis_error_guidance(params[:error_type]) if defined?(get_diagnosis_error_guidance)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json]
+end
+
+def diagnosis_health_check
+ return perform_environment_health_check if defined?(perform_environment_health_check)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json]
+end
+
+# Diagnose a specific job; unlike the wrappers above, this returns a plain Hash rather than a Rack triplet
+def diagnosis_diagnose_job(job_id)
+ return diagnose_job_by_id(job_id) if defined?(diagnose_job_by_id)
+ { 'error' => 'Diagnosis not available' }
+end
+
+def diagnosis_start_wizard(params)
+ return start_diagnosis_wizard(params[:error_type], params[:difficulty_level]) if defined?(start_diagnosis_wizard)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json]
+end
+
+# Process wizard steps
+def diagnosis_wizard_step(params)
+ return process_wizard_step(params[:wizard_id], params[:step_number], params[:user_response]) if defined?(process_wizard_step)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json]
+end
+
+# Get monitoring status
+def diagnosis_monitoring_status
+ return get_monitoring_status if defined?(get_monitoring_status)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Monitoring not available' }.to_json]
+end
+
+# Get system health metrics
+def diagnosis_system_health
+ return monitor_system_health if defined?(monitor_system_health)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'System health monitoring not available' }.to_json]
+end
+
+# Trigger health check
+def diagnosis_health_metrics
+ return collect_health_metrics if defined?(collect_health_metrics)
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Health metrics collection not available' }.to_json]
+end
+
+# Generate diagnosis report
+def diagnosis_generate_report(params)
+ return [400, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Missing job_id' }.to_json] unless params[:job_id]
+
+ if defined?(generate_diagnosis_report) && defined?(diagnose_job)
+ # Get job data
+ job_query = { query: { term: { '_id' => params[:job_id] } }, size: 1 }
+ result = es_query(job_query)['hits']['hits']
+ return [404, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Job not found' }.to_json] if result.empty?
+
+ job_source = result.first
+ job_data = get_job_info(job_source)
+
+ # Perform diagnosis analysis
+ diagnosed_job = diagnose_job(job_data)
+ diagnosis_data = diagnosed_job['diagnosis']
+
+ if diagnosis_data
+ report_text = generate_diagnosis_report(job_data, diagnosis_data)
+ return [200, headers.merge('Access-Control-Allow-Origin' => '*'), {
+ 'report' => report_text,
+ 'job_id' => params[:job_id],
+ 'generated_at' => Time.now.to_s
+ }.to_json]
+ else
+ return [400, headers.merge('Access-Control-Allow-Origin' => '*'), {
+ 'error' => 'No diagnosis data available for this job',
+ 'job_id' => params[:job_id]
+ }.to_json]
+ end
+ end
+
+ [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Report generation not available' }.to_json]
+rescue StandardError => e
+ log_error({
+ 'message' => e.message,
+ 'error_message' => "diagnosis_generate_report error, input: #{params}"
+ })
+ [500, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Report generation failed' }.to_json]
+end
+
def get_job_query_range(condition_fields)
range = { start_time: {} }
start_date = condition_fields[:start_date]
@@ -401,13 +515,18 @@ def get_jobs_body(params)
jobs, total = search_job(params, page_size, page_num)
jobs, branches = get_optimize_jobs_branches(jobs)
- {
+
+ response = {
total: total,
filter: params,
banner: get_banner(params[:upstream_repo], branches),
jobs: jobs,
fields: FIELDS
- }.to_json
+ }
+
+ # Diagnosis feature enhancement (following project functional style)
+ response = enhance_response_with_diagnosis(response) if defined?(enhance_response_with_diagnosis)
+ response.to_json
end
def get_jobs(params)
@@ -426,6 +545,51 @@ def get_jobs(params)
[200, headers.merge('Access-Control-Allow-Origin' => '*'), body]
end
+# Get detailed information for a single job (including complete diagnosis information)
+def get_single_job_info(params)
+ payload = auth(params)
+ params[:my_account] = payload['my_account'] if payload && payload['my_account']
+
+ begin
+ job_id = params[:job_id]
+ return [400, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'Missing job_id parameter' }.to_json] unless job_id
+
+ # Query single job
+ query = {
+ query: { term: { '_id' => job_id } },
+ size: 1
+ }
+
+ result = es_query(query)['hits']['hits']
+ return [404, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'Job not found' }.to_json] if result.empty?
+
+ job_source = result.first
+ job_info = get_job_info(job_source)
+
+ # Add additional detailed information
+ job_info['_source_metadata'] = {
+ 'index' => job_source['_index'],
+ 'type' => job_source['_type'],
+ 'score' => job_source['_score']
+ }
+
+ # Diagnosis feature enhancement (following project functional style)
+ job_info = diagnose_job(job_info) if defined?(diagnose_job)
+
+ [200, headers.merge('Access-Control-Allow-Origin' => '*'), job_info.to_json]
+
+ rescue StandardError => e
+ log_error({
+ 'message' => e.message,
+ 'error_message' => "get_single_job_info error, input: #{params}"
+ })
+ return [500, headers.merge('Access-Control-Allow-Origin' => '*'),
+ { 'error' => 'Failed to get job info', 'message' => e.message }.to_json]
+ end
+end
+
def get_repo_url(urls)
return unless urls.is_a?(Array)
@@ -757,13 +921,7 @@ def group_jobs_stats(params)
[200, headers.merge('Access-Control-Allow-Origin' => '*'), body]
end
-# -------------------------------------------------------------------------------------------
-# job error table like:
-# job_id error_id error_message result_root
-# -------------------------------------------------------------------------------------
-# crystal.630608 "stderr.xxx" "messag:xxxx" $result_root
-# ...
-# -------------------------------------------------------------------------------------------
+
def get_job_error(params)
payload = auth(params)
@@ -1391,7 +1549,7 @@ def get_job_info(job)
job_info['install_job_health'] = 'success'
end
- when /install-rpm\.(.*)_(?:cmd|service)_(.*)\.(.*)/
+ when /install-rpm\.(.*)_(?:cmd|service)_(.*)\.(.*)/
if $3 == 'fail'
job_info['install_job_health'] = 'fail'
return job_info
@@ -1400,6 +1558,9 @@ def get_job_info(job)
end
end
end
+
+ # Diagnosis feature enhancement (following project functional style)
+ job_info = diagnose_job(job_info) if defined?(diagnose_job)
job_info
end
--
Gitee