From 00d1fb62b9643c5e9872304d04c700c68f7bd3fa Mon Sep 17 00:00:00 2001 From: Libres-coder <2597242922@qq.com> Date: Sun, 21 Sep 2025 05:25:13 +0800 Subject: [PATCH] feat: enhance build failure diagnosis and automatic troubleshooting capabilities --- container/service-health/html/diagnosis.html | 351 ++++++++++++ container/service-health/html/index.html | 222 ++++---- .../service-health/html/job_details.html | 248 +++++++++ container/web-backend/web-backend | 245 +++++++++ etc/compass-ci/diagnosis/config.yaml | 168 ++++++ lib/job_diagnosis.rb | 421 +++++++++++++++ lib/job_diagnosis_api.rb | 502 ++++++++++++++++++ lib/job_diagnosis_library.rb | 378 +++++++++++++ lib/job_diagnosis_monitoring.rb | 308 +++++++++++ src/lib/web_backend.rb | 181 ++++++- 10 files changed, 2906 insertions(+), 118 deletions(-) create mode 100644 container/service-health/html/diagnosis.html create mode 100644 container/service-health/html/job_details.html create mode 100644 etc/compass-ci/diagnosis/config.yaml create mode 100644 lib/job_diagnosis.rb create mode 100644 lib/job_diagnosis_api.rb create mode 100644 lib/job_diagnosis_library.rb create mode 100644 lib/job_diagnosis_monitoring.rb diff --git a/container/service-health/html/diagnosis.html b/container/service-health/html/diagnosis.html new file mode 100644 index 000000000..73851da1b --- /dev/null +++ b/container/service-health/html/diagnosis.html @@ -0,0 +1,351 @@ + + + + + Build Diagnosis + + + +

Compass CI Build Failure Diagnosis Center

+
+ + +
+ + + + + + + + + + + + + + +
System Health Status
+ Diagnosis Function + Checking...
+ Error Type Library + Loading...
+ System Status + Checking...
+
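The status cells above (Diagnosis Function, Error Type Library, System Status) are populated from the new /diagnosis/status and /diagnosis/health routes. A minimal client-side sketch, assuming the web-backend is reachable at http://localhost:8080 (the address is an assumption, not set by this patch) and that the diagnosis_status/diagnosis_health_check helpers return the JSON shapes suggested by job_diagnosis_api.rb:

require 'net/http'
require 'json'
require 'uri'

BASE_URL = 'http://localhost:8080' # hypothetical web-backend address

def fetch_json(path)
  res = Net::HTTP.get_response(URI("#{BASE_URL}#{path}"))
  JSON.parse(res.body)
end

status = fetch_json('/diagnosis/status')   # e.g. {"enabled"=>true, "version"=>"1.0.0", ...}
health = fetch_json('/diagnosis/health')   # per-component checks plus an overall status
puts status['status']
puts health['overall_status']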
+ + +
+ + + + + + + + + + +
Job Error Diagnosis
+ Job ID + + + +
+ Diagnosis Mode + + +
+
+ + + + + +
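Behind this form the page posts the entered job ID to /diagnosis/diagnose. A hedged sketch of the same call made directly, with a placeholder host and job ID:

require 'net/http'
require 'json'
require 'uri'

uri = URI('http://localhost:8080/diagnosis/diagnose')   # host and port are placeholders
req = Net::HTTP::Post.new(uri, 'Content-Type' => 'application/json')
req.body = { job_id: 'example-job-id' }.to_json          # placeholder job ID

res = Net::HTTP.start(uri.hostname, uri.port) { |http| http.request(req) }
result = JSON.parse(res.body)
# A successful diagnosis carries fields such as error_type, causes and quick_fixes;
# a blank job_id returns 400, and a disabled feature returns 503.
puts result['error_type']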
+ + + + + + +
Interactive Troubleshooting Wizard
+ Error Type + + + +
+
+ + + + + +
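The wizard is driven by the /diagnosis/start_wizard and /diagnosis/wizard_step routes added in this patch. A minimal sketch of one round trip, assuming a placeholder host and that the diagnosis_start_wizard/diagnosis_wizard_step helpers hand the symbolized parameters to start_diagnosis_wizard and process_wizard_step:

require 'net/http'
require 'json'
require 'uri'

BASE = URI('http://localhost:8080')   # placeholder web-backend address

def post_json(base, path, payload)
  req = Net::HTTP::Post.new(path, 'Content-Type' => 'application/json')
  req.body = payload.to_json
  res = Net::HTTP.start(base.hostname, base.port) { |http| http.request(req) }
  JSON.parse(res.body)
end

# Start a session for one of the error types defined in job_diagnosis_library.rb.
session = post_json(BASE, '/diagnosis/start_wizard',
                    'error_type' => 'build_dependency_error', 'difficulty_level' => 'beginner')

# Walk the steps, echoing the user's answer back each time.
step = post_json(BASE, '/diagnosis/wizard_step',
                 'wizard_id' => session['wizard_id'], 'step_number' => 1, 'user_response' => 'yes')
puts step['next_action']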
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Common Error Types
Error Type | Typical Symptoms | Common Solutions
build_dependency_failed | Package installation failure | Check package versions and source configuration
test_execution_error | Test execution interruption | Check test environment and test scripts
testbox_error | Test machine failure | Check test machine status and network connection
scheduler_error | Scheduler exception | Check scheduling queue and resource allocation
timeout_error | Execution timeout | Check execution time limits and system load
+
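The same error-type catalogue is exposed over the API. A short sketch (placeholder host) that lists the supported types and pulls the guidance for one of them; the shapes follow get_diagnosis_error_types and DIAGNOSIS_LIBRARY in this patch, assuming the route-level wrappers return them unchanged:

require 'net/http'
require 'json'
require 'uri'

base = 'http://localhost:8080'   # placeholder web-backend address

types = JSON.parse(Net::HTTP.get(URI("#{base}/diagnosis/error_types")))
types['error_types'].each { |t| puts "#{t['type']}: #{t['title']} (#{t['severity']})" }

guidance = JSON.parse(Net::HTTP.get(URI("#{base}/diagnosis/error_guidance/build_compile_error")))
puts guidance['solutions']   # numbered solution steps from the knowledge base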
+ + + + + + \ No newline at end of file diff --git a/container/service-health/html/index.html b/container/service-health/html/index.html index aa78fe05b..12ee09b27 100644 --- a/container/service-health/html/index.html +++ b/container/service-health/html/index.html @@ -1,108 +1,114 @@ - - - - - - Health Monitor - - - - -

Compass CI Service Health Monitor

-
- - - - - - - - - - - - - - - - - - - - -
- git mirror - - machine list - - log errors -
- job stderrs - - jobs boot time - - srpm list -
- no service - - no service - - no service -
- - - - + + + + + + Health Monitor + + + + +

Compass CI Service Health Monitor

+
+ + + + + + + + + + + + + + + + + + + + +
+ git mirror + + machine list + + log errors +
+ job stderrs + + jobs boot time + + srpm list +
+ build diagnosis + + job details + + no service +
+ + + + diff --git a/container/service-health/html/job_details.html b/container/service-health/html/job_details.html new file mode 100644 index 000000000..07d411dd5 --- /dev/null +++ b/container/service-health/html/job_details.html @@ -0,0 +1,248 @@ + + + + + Job Details + + + +

Job Details and Diagnosis

+
+ + +
+ + + + + + +
Job Query
+ Job ID + + + +
+
+ + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Common Job State Description
State | Meaning | Handling Suggestions
failed | Job execution failed | Check error logs, verify configuration and environment
incomplete | Job not completed | Check execution time and resource limits
timeout | Execution timeout | Check system load and time limit settings
running | Currently executing | Wait for completion or check execution progress
finished | Completed normally | View execution results and generated reports
+
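The page fetches a single record through the new GET /web_backend/job_info/:job_id route. A minimal sketch with placeholder host and job ID; the exact response shape depends on get_single_job_info in web_backend.rb, but with diagnosis enabled it is expected to include the 'diagnosis' hash that diagnose_job merges into the job data:

require 'net/http'
require 'json'
require 'uri'

job_id = 'example-job-id'   # placeholder job ID
info = JSON.parse(Net::HTTP.get(URI("http://localhost:8080/web_backend/job_info/#{job_id}")))

puts info['job_state']
puts info.dig('diagnosis', 'error_type')   # present when the job needed diagnosis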
+ + + + + + \ No newline at end of file diff --git a/container/web-backend/web-backend b/container/web-backend/web-backend index 791d044ab..5bed32528 100755 --- a/container/web-backend/web-backend +++ b/container/web-backend/web-backend @@ -66,6 +66,13 @@ get '/web_backend/get_jobs' do get_jobs(params) end +# GET /web_backend/job_info/:job_id +# Get detailed information for a single job (including diagnosis information) +# Response: Detailed information for a single job, including complete diagnosis data +get '/web_backend/job_info/:job_id' do + get_single_job_info(params.merge(job_id: params[:job_id])) +end + # GET /web_backend/active_testbox # return to testbox that are active within 30 minutes get '/web_backend/active_testbox' do @@ -507,3 +514,241 @@ end get '/user_auth/get_client_info' do client_info() end + +# ==================== Diagnosis Feature API Routes ==================== +# The following routes are only available when diagnosis feature is enabled + +# GET /diagnosis/status +# Response: Diagnosis feature status information +get '/diagnosis/status' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_status) + diagnosis_status + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Diagnosis feature not available', 'enabled' => ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true'}.to_json] + end +end + +# GET /diagnosis/health +# Response: Diagnosis feature health check results +get '/diagnosis/health' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_health_check) + diagnosis_health_check + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Diagnosis health check not available'}.to_json] + end +end + +# GET /diagnosis/error_types +# Response: List of supported error types +get '/diagnosis/error_types' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_error_types) + diagnosis_error_types + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Diagnosis error types not available'}.to_json] + end +end + +# GET /diagnosis/error_guidance/:error_type +# Response: Guidance information for specific error types +get '/diagnosis/error_guidance/:error_type' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_error_guidance) + diagnosis_error_guidance(params.merge(error_type: params[:error_type])) + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Diagnosis error guidance not available'}.to_json] + end +end + +# GET /diagnosis/capabilities +# Response: Diagnosis feature integrity check +get '/diagnosis/capabilities' do + capabilities = { + 'diagnosis_enabled' => ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true', + 'available_features' => [], + 'api_endpoints' => [], + 'system_status' => 'unknown' + } + + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' + # Check availability of each functional module + capabilities['available_features'] << 'error_identification' if respond_to?(:enhance_job_info_with_diagnosis) + capabilities['available_features'] << 'health_check' if respond_to?(:diagnosis_health_check) + capabilities['available_features'] << 'error_guidance' if respond_to?(:diagnosis_error_guidance) + capabilities['available_features'] << 'status_monitoring' if respond_to?(:diagnosis_status) + capabilities['available_features'] << 'user_feedback' if respond_to?(:record_diagnosis_feedback) + + # Available API endpoints + capabilities['api_endpoints'] = [ + '/diagnosis/status', + 
'/diagnosis/health', + '/diagnosis/error_types', + '/diagnosis/error_guidance/:type', + '/diagnosis/capabilities', + '/diagnosis/feedback', + '/diagnosis/quality_report' + ] + + # System status + capabilities['system_status'] = capabilities['available_features'].length > 0 ? 'operational' : 'limited' + else + capabilities['system_status'] = 'disabled' + end + + [200, headers.merge('Access-Control-Allow-Origin' => '*'), capabilities.to_json] +end + +# POST /diagnosis/diagnose +# Diagnose specific job +post '/diagnosis/diagnose' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_diagnose_job) + request.body.rewind + begin + data = JSON.parse(request.body.read) + job_id = data['job_id'] + + if job_id.nil? || job_id.strip.empty? + [400, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'job_id is required'}.to_json] + else + result = diagnosis_diagnose_job(job_id.strip) + + if result + [200, headers.merge('Access-Control-Allow-Origin' => '*'), result.to_json] + else + [500, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Failed to diagnose job'}.to_json] + end + end + rescue JSON::ParserError + [400, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Invalid JSON format'}.to_json] + rescue StandardError => e + [500, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => "Diagnosis failed: #{e.message}"}.to_json] + end + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Diagnosis service not available'}.to_json] + end +end + +# POST /diagnosis/start_wizard +# Start interactive diagnosis wizard +post '/diagnosis/start_wizard' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_start_wizard) + request.body.rewind + wizard_params = JSON.parse(request.body.read) rescue {} + diagnosis_start_wizard(wizard_params.transform_keys(&:to_sym)) + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + { 'error' => 'Diagnosis wizard not available' }.to_json] + end +end + +# POST /diagnosis/wizard_step +# Process wizard steps +post '/diagnosis/wizard_step' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_wizard_step) + request.body.rewind + step_params = JSON.parse(request.body.read) rescue {} + diagnosis_wizard_step(step_params.transform_keys(&:to_sym)) + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + { 'error' => 'Diagnosis wizard not available' }.to_json] + end +end + +# GET /diagnosis/generate_report/:job_id +# Generate comprehensive diagnosis report +get '/diagnosis/generate_report/:job_id' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_generate_report) + diagnosis_generate_report(params.merge(job_id: params[:job_id])) + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + { 'error' => 'Diagnosis report generation not available' }.to_json] + end +end + +# GET /diagnosis/monitoring/status +# Response: Monitoring system status information +get '/diagnosis/monitoring/status' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_monitoring_status) + diagnosis_monitoring_status + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + { 'error' => 'Monitoring status not available' }.to_json] + end +end + +# GET /diagnosis/monitoring/health +# Response: System health monitoring data +get '/diagnosis/monitoring/health' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_system_health) + 
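+    # Every diagnosis route uses this double guard, the COMPASS_DIAGNOSIS_ENABLED flag
+    # plus respond_to?, so the backend still boots and answers 503 when the diagnosis
+    # modules are not loaded.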
diagnosis_system_health + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + { 'error' => 'System health monitoring not available' }.to_json] + end +end + +# GET /diagnosis/monitoring/metrics +# Response: Current health metrics data +get '/diagnosis/monitoring/metrics' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:diagnosis_health_metrics) + diagnosis_health_metrics + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + { 'error' => 'Health metrics collection not available' }.to_json] + end +end + +# POST /diagnosis/feedback +# Receive user feedback on diagnosis results +post '/diagnosis/feedback' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:record_diagnosis_feedback) + request.body.rewind + feedback_data = JSON.parse(request.body.read) rescue {} + result = record_diagnosis_feedback(feedback_data) + + if result + [200, headers.merge('Access-Control-Allow-Origin' => '*'), + {'status' => 'success', 'message' => 'Feedback recorded successfully'}.to_json] + else + [400, headers.merge('Access-Control-Allow-Origin' => '*'), + {'status' => 'error', 'message' => 'Failed to record feedback'}.to_json] + end + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Feedback system not available'}.to_json] + end +end + +# GET /diagnosis/quality_report +# Get diagnosis quality report +get '/diagnosis/quality_report' do + if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' && respond_to?(:get_diagnosis_quality_report) + report = get_diagnosis_quality_report + + if report + [200, headers.merge('Access-Control-Allow-Origin' => '*'), report.to_json] + else + [500, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Failed to generate quality report'}.to_json] + end + else + [503, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Quality report not available'}.to_json] + end +end + +# OPTIONS support for cross-domain requests +options '/diagnosis/*' do + [200, headers.merge({ + 'Access-Control-Allow-Origin' => '*', + 'Access-Control-Allow-Methods' => 'GET,POST,OPTIONS', + 'Access-Control-Allow-Headers' => 'Content-Type,Authorization' + }), ''] +end diff --git a/etc/compass-ci/diagnosis/config.yaml b/etc/compass-ci/diagnosis/config.yaml new file mode 100644 index 000000000..8d04339a4 --- /dev/null +++ b/etc/compass-ci/diagnosis/config.yaml @@ -0,0 +1,168 @@ +# Compass CI 作业诊断功能配置文件 +# 用于配置诊断功能的行为和参数 + +# 诊断功能开关(也可通过环境变量 COMPASS_DIAGNOSIS_ENABLED 控制) +enabled: true + +# 诊断功能版本 +version: "1.0.0" + +# 配置文件版本 +config_version: "1.1.0" + +# 功能特性开关 +feature_flags: + # 启用多难度级别支持 + enable_difficulty_levels: true + + # 启用智能推荐 + enable_smart_recommendations: true + + # 启用互动式向导 + enable_interactive_wizard: true + + # 启用实时帮助 + enable_contextual_help: true + +# 需要增强诊断信息的API端点 +endpoints: + - '/web_backend/get_jobs' + - '/web_backend/job_info' + - '/web_backend/get_job' + - '/web_backend/compare' + +# 诊断功能配置 +diagnosis: + # 是否在响应中包含诊断统计信息 + include_statistics: true + + # 是否在日志中记录诊断操作 + log_diagnosis_operations: false + + # 最大诊断处理时间(毫秒) + max_processing_time_ms: 100 + + # 诊断信息缓存时间(秒) + cache_duration_seconds: 300 + +# 帮助中心配置 +help_center: + base_url: 'https://compass-ci.readthedocs.io' + troubleshooting_path: '/troubleshooting/' + build_failure_guide: '/docs/build-failure-troubleshooting.html' + timeout_guide: '/docs/timeout-issues.html' + +# 错误严重程度配置 +severity_levels: + high: + - 'runtime_crash' + - 'out_of_memory' + - 'boot_failure' + medium: + - 'execution_timeout' + - 'disk_failure' + - 
'download_failure' + low: + - 'build_failure' + - 'install_failure' + - 'abnormal_termination' + +# 预估修复时间配置(分钟) +estimated_fix_times: + build_failure: 10 + install_failure: 15 + execution_timeout: 20 + download_failure: 25 + runtime_crash: 30 + out_of_memory: 35 + boot_failure: 45 + disk_failure: 50 + +# 诊断模块配置 +modules: + # 核心诊断引擎 + core_diagnosis: true + + # Web后端中间件 + web_middleware: true + + # 独立API服务(未来扩展) + standalone_api: false + + # 健康检查模块 + health_check: true + +# 性能配置 +performance: + # 是否启用诊断缓存 + enable_caching: true + + # 缓存过期时间(秒) + cache_ttl: 300 + + # 最大并发诊断任务数 + max_concurrent_diagnoses: 10 + + # 诊断超时时间(毫秒) + diagnosis_timeout_ms: 500 + +# 安全配置 +security: + # 是否在响应中包含详细的错误堆栈 + include_error_stack: false + + # 是否允许外部配置覆盖 + allow_config_override: true + + # 信任的IP地址列表(用于管理接口) + trusted_ips: ["127.0.0.1", "::1"] + +# 监控配置 +monitoring: + # 是否启用诊断指标收集 + enable_metrics: true + + # 指标收集间隔(秒) + metrics_interval: 60 + + # 是否记录诊断性能统计 + track_performance: true + +# 用户体验个性化配置 +user_experience: + # 默认难度级别 + default_difficulty_level: 'intermediate' + + # 是否启用智能难度识别 + smart_difficulty_detection: true + + # 不同技术水平的用户配置 + difficulty_levels: + beginner: + show_detailed_explanations: true + include_terminology_help: true + provide_visual_aids: true + estimated_time_multiplier: 1.5 + + intermediate: + show_key_points: true + highlight_common_pitfalls: true + provide_best_practices: true + estimated_time_multiplier: 1.0 + + expert: + show_concise_info: true + enable_advanced_options: true + suggest_automation: true + estimated_time_multiplier: 0.7 + +# 自适应学习配置 +adaptive_learning: + # 是否启用用户行为学习 + enable_user_behavior_learning: false + + # 是否根据成功率调整建议 + adjust_recommendations: true + + # 是否记录用户偏好 + track_user_preferences: false \ No newline at end of file diff --git a/lib/job_diagnosis.rb b/lib/job_diagnosis.rb new file mode 100644 index 000000000..7e10c31e2 --- /dev/null +++ b/lib/job_diagnosis.rb @@ -0,0 +1,421 @@ +# SPDX-License-Identifier: MulanPSL-2.0+ +# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved. +# frozen_string_literal: true + +# Compass CI job diagnosis core module +# Core diagnosis logic with minimal dependencies + +require_relative 'job_diagnosis_library' + +def diagnosis_enabled? + ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' +end + +def diagnose_job(job_data) + return job_data unless diagnosis_enabled? + return job_data unless valid_job_data?(job_data) + return job_data unless needs_diagnosis?(job_data) + + diagnosis_info = analyze_job_error(job_data) + return job_data unless diagnosis_info.is_a?(Hash) + + job_data.merge('diagnosis' => diagnosis_info) +rescue StandardError => e + log_diagnosis_error(e, 'diagnose_job') if defined?(log_error) + job_data +end + +def diagnose_jobs(jobs) + return jobs unless diagnosis_enabled? + return jobs unless jobs.is_a?(Array) && !jobs.empty? + + return jobs if system_overloaded? 
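+  # When the batch exceeds the safe size, failed/incomplete/timeout/stuck jobs are
+  # diagnosed first and anything beyond the quota is passed through undiagnosed,
+  # so large result sets are never blocked on per-job analysis.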
+ + max_size = calculate_safe_batch_size(jobs.length) + + if jobs.length > max_size + # Prioritize failed jobs for diagnosis + priority_jobs, normal_jobs = prioritize_jobs_for_diagnosis(jobs) + + # Process high priority jobs first + priority_diagnosed = priority_jobs.first(max_size / 2).map { |job| diagnose_job_with_timeout(job) } + remaining_quota = max_size - priority_diagnosed.length + + if remaining_quota > 0 + normal_diagnosed = normal_jobs.first(remaining_quota).map { |job| diagnose_job_with_timeout(job) } + priority_diagnosed + normal_diagnosed + jobs[max_size..-1] + else + priority_diagnosed + jobs[priority_diagnosed.length..-1] + end + else + jobs.map { |job| diagnose_job_with_timeout(job) } + end +rescue StandardError => e + log_diagnosis_error(e, 'diagnose_jobs') if defined?(log_error) + jobs +end + +def system_overloaded? + begin + if RUBY_PLATFORM.match?(/win32|mingw|mswin/) + # Windows: check CPU usage + cpu_usage = `wmic cpu get loadpercentage /value`.match(/LoadPercentage=(\d+)/)[1].to_f + cpu_usage > 80.0 # Consider overloaded if CPU usage > 80% + else + # Linux/Unix: check load average + load_avg = `uptime`.match(/load average: ([\d\.]+)/)[1].to_f + load_avg > 5.0 # Consider overloaded if load > 5.0 + end + rescue StandardError + false # Consider normal if load check fails + end +end + +def calculate_safe_batch_size(total_jobs) + base_size = 50 + + # Dynamically adjust based on total job count + if total_jobs > 1000 + base_size = 30 + elsif total_jobs > 500 + base_size = 40 + end + + # Check available memory - cross-platform compatible + begin + if RUBY_PLATFORM.match?(/win32|mingw|mswin/) + # Windows: use wmic to get available memory + available_mem = `wmic OS get FreePhysicalMemory /value`.match(/FreePhysicalMemory=(\d+)/)[1].to_i / 1024 + else + # Linux/Unix: use free command + mem_info = `free -m`.lines[1].split + available_mem = mem_info[6].to_i + end + + if available_mem < 500 # Available memory < 500MB + base_size = [base_size / 2, 10].max + end + rescue StandardError + # Memory check failed, use conservative value + base_size = 20 + end + + base_size +end + +def prioritize_jobs_for_diagnosis(jobs) + priority_states = %w[failed incomplete timeout execution_stuck] + + priority_jobs = jobs.select { |job| priority_states.include?(job['job_state']) } + normal_jobs = jobs - priority_jobs + + [priority_jobs, normal_jobs] +end + +def diagnose_job_with_timeout(job_data, timeout_seconds = 5) + require 'timeout' + + Timeout.timeout(timeout_seconds) do + diagnose_job(job_data) + end +rescue Timeout::Error + log_diagnosis_error(StandardError.new('Diagnosis timeout'), 'diagnose_job_timeout') if defined?(log_error) + job_data +rescue StandardError => e + log_diagnosis_error(e, 'diagnose_job_with_timeout') if defined?(log_error) + job_data +end + +def enhance_response_with_diagnosis(response) + return response unless diagnosis_enabled? + return response unless response.is_a?(Hash) + + response.merge('diagnosis_available' => true, 'diagnosis_version' => '1.0.0') +rescue StandardError => e + log_diagnosis_error(e, 'enhance_response') if defined?(log_error) + response +end + +def valid_job_data?(job_data) + job_data.is_a?(Hash) && !job_data.empty? 
&& job_data['job_state'].is_a?(String) +end + +def needs_diagnosis?(job_data) + job_state = job_data['job_state'] + + success_states = %w[finished complete extract_result_finished extract_stats_finished] + return false if success_states.include?(job_state) + + running_states = %w[running queued post_run] + if running_states.include?(job_state) + return check_if_stuck?(job_data, job_state) + end + + true +end + +def check_if_stuck?(job_data, job_state) + return false unless job_data['submit_time'] + + begin + submit_time = Time.parse(job_data['submit_time']) + current_time = Time.now + elapsed_time = current_time - submit_time + + timeout_thresholds = { + 'queued' => 3600, + 'running' => 7200, + 'post_run' => 1800 + } + + threshold = timeout_thresholds[job_state] || 1800 + elapsed_time > threshold + rescue StandardError + false + end +end + +def analyze_job_error(job_data) + job_state = job_data['job_state'] + stats = extract_stats_data(job_data) + error_ids = extract_error_ids_data(job_data) + + error_type = detect_error_type(job_state, stats, error_ids, job_data) + error_info = DIAGNOSIS_LIBRARY[error_type] || default_error_info + + build_diagnosis_result(job_data, job_state, error_type, error_info) +end + +def extract_stats_data(job_data) + stats = job_data['stats'] || job_data.dig('_source', 'stats') + stats.is_a?(Hash) ? stats : {} +end + +def extract_error_ids_data(job_data) + error_ids = job_data['error_ids'] || job_data.dig('_source', 'error_ids') + error_ids.is_a?(Array) ? error_ids : [] +end + +def build_diagnosis_result(job_data, job_state, error_type, error_info) + { + 'status' => 'analyzed', + 'error_type' => error_type, + 'original_job_state' => job_state, + 'title' => error_info[:title], + 'category' => error_info[:category], + 'severity' => error_info[:severity], + 'possible_causes' => error_info[:causes], + 'suggested_solutions' => customize_solutions(error_info[:solutions], job_data), + 'quick_fixes' => customize_solutions(error_info[:quick_fixes], job_data), + 'job_id' => job_data['id'], + 'result_root' => job_data['result_root'], + 'analyzed_at' => Time.now.to_s, + 'confidence_score' => calculate_confidence_score(job_state, job_data) + } +end + +def detect_error_type(job_state, stats, error_ids, job_data) + if error_ids && !error_ids.empty? + precise_type = analyze_error_ids_patterns(error_ids) + return precise_type if precise_type != 'unknown' + end + + if stats && !stats.empty? + stats_type = analyze_stats_patterns(stats) + return stats_type if stats_type != 'unknown' + end + + return analyze_job_state_patterns(job_state, job_data) +end + +def analyze_error_ids_patterns(error_ids) + return 'unknown' unless error_ids.is_a?(Array) + + # Build dependency errors + if error_ids.any? { |id| id.match?(/dependency|missing|require/) } + return 'build_dependency_error' + end + + # Compile errors + if error_ids.any? { |id| id.match?(/compile|gcc|build-pkg\..*error/) } + return 'build_compile_error' + end + + # Spec file errors + if error_ids.any? { |id| id.match?(/spec|pkgbuild/) } + return 'build_spec_error' + end + + # Test execution errors + if error_ids.any? { |id| id.match?(/test|check|verify/) } + return 'test_execution_error' + end + + # Has error_ids but cannot identify specific type + return 'build_unknown_error' if !error_ids.empty? 
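+  # Illustrative inputs only (not real error_ids): an id like 'missing-dependency-glibc'
+  # would match the dependency branch above, 'gcc.compile-error' the compile branch,
+  # and 'testcase.check-failed' the test branch.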
+ + 'unknown' +end + +def analyze_stats_patterns(stats) + return 'unknown' unless stats.is_a?(Hash) + + # stderr.exit_fail is the most important failure indicator + if stats['stderr.exit_fail'] && stats['stderr.exit_fail'] > 0 + # Further analyze specific types of stderr + stderr_keys = stats.keys.select { |k| k.start_with?('stderr.') } + + return 'build_timeout_error' if stderr_keys.any? { |k| k.match?(/timeout|time/) } + return 'build_memory_error' if stderr_keys.any? { |k| k.match?(/memory|oom/) } + return 'build_permission_error' if stderr_keys.any? { |k| k.match?(/permission|access/) } + return 'build_exit_error' # Generic exit error + end + + # Timeout related statistics + if stats['timeout'] && stats['timeout'] > 0 + return 'build_timeout_error' + end + + # Installation related statistics + install_keys = stats.keys.select { |k| k.match?(/install.*fail/) } + if !install_keys.empty? + return 'build_dependency_error' + end + + 'unknown' +end + +def analyze_job_state_patterns(job_state, job_data) + # 1. Scheduler phase errors + return 'scheduler_error' if job_state == 'submit' + + # 2. Queue timeout errors + return 'queue_timeout' if job_state == 'queued' && check_if_stuck?(job_data, 'queued') + + # 3. Testbox related errors + return 'testbox_error' if %w[boot incomplete timeout].include?(job_state) + + # 4. Running phase stuck + return 'execution_stuck' if job_state == 'running' && check_if_stuck?(job_data, 'running') + + # 5. Post-processing phase issues + return 'post_processing_error' if job_state == 'post_run' && check_if_stuck?(job_data, 'post_run') + + # 6. Canceled state analysis + return 'job_canceled' if job_state == 'canceled' + + # 7. Build failure state - precise analysis based on real stats + if job_state == 'failed' + return analyze_failed_job_details(job_data) + end + + # 8. Default return original state + job_state +end + +def analyze_failed_job_details(job_data) + stats = extract_stats_data(job_data) + return 'failed' unless stats.is_a?(Hash) + + # Based on real Compass CI health status calculation logic + + # 1. Check if build started successfully (based on rpmbuild.start_time.message) + build_started = stats.key?('rpmbuild.start_time.message') + + # 2. Check installation phase failures (based on install-rpm.*.fail pattern) + install_failed = stats.keys.any? { |k| k.match?(/install-rpm\..*\.fail$/) } + service_failed = stats.keys.any? { |k| k.match?(/install-rpm\..*_service_.*\.fail$/) } + + # 3. Check functional test status (based on rpmbuild.func.message) + func_success = stats.key?('rpmbuild.func.message') + + # 4. 
Return error type by priority + return 'build_dependency_error' if install_failed + return 'service_start_error' if service_failed + return 'test_execution_error' if build_started && !func_success + return 'build_compile_error' unless build_started + + # Default build failure + 'failed' +end + +def customize_solutions(solutions, job_data) + return [] unless solutions.is_a?(Array) + + # Build complete template variable mapping + template_vars = build_template_variables(job_data) + + solutions.map do |solution| + customize_single_solution(solution, template_vars) + end +end + +def build_template_variables(job_data) + { + '${result_root}' => job_data['result_root'] || '/result/path/not/available', + '${testbox}' => job_data['testbox'] || 'unknown-testbox', + '${suite}' => job_data['suite'] || 'unknown-suite', + '${os}' => job_data['os'] || 'unknown-os', + '${os_version}' => job_data['os_version'] || 'unknown-version', + '${os_arch}' => job_data['os_arch'] || 'unknown-arch', + '${job_id}' => job_data['id'] || 'unknown-job-id', + '' => extract_service_name(job_data), + '' => extract_dependency_name(job_data) + } +end + +def customize_single_solution(solution, template_vars) + customized = solution.dup + template_vars.each do |placeholder, value| + customized = customized.gsub(placeholder, value.to_s) + end + customized +end + +def extract_service_name(job_data) + stats = extract_stats_data(job_data) + service_key = stats.keys.find { |k| k.match?(/install-rpm.*service/) } + return 'unknown-service' unless service_key + + # Extract service name from key: install-rpm.httpd_service_start.fail -> httpd + service_key.match(/install-rpm\.(.*)_service/)[1] rescue 'unknown-service' +end + +def extract_dependency_name(job_data) + stats = extract_stats_data(job_data) + install_key = stats.keys.find { |k| k.match?(/install-rpm.*install.*fail/) } + return 'unknown-package' unless install_key + + # Extract package name from key + install_key.match(/install-rpm\.(.*)_install/)[1] rescue 'unknown-package' +end + +def calculate_confidence_score(job_state, job_data) + score = 0.3 + score += 0.3 if job_data['build_job_health'] + score += 0.2 if job_data['install_job_health'] + score += 0.15 if %w[failed incomplete].include?(job_state) + [score, 1.0].min +end + +def default_error_info + { + title: 'Unknown Error', + category: 'unknown', + severity: 'medium', + causes: ['Unknown cause'], + solutions: ['Please contact technical support'], + quick_fixes: ['Please contact technical support'] + } +end + +def log_diagnosis_error(error, context) + return unless defined?(log_error) + + log_error({ + 'component' => 'job_diagnosis', + 'context' => context, + 'message' => error.message, + 'error_type' => 'diagnosis_error' + }) +end diff --git a/lib/job_diagnosis_api.rb b/lib/job_diagnosis_api.rb new file mode 100644 index 000000000..0456315a0 --- /dev/null +++ b/lib/job_diagnosis_api.rb @@ -0,0 +1,502 @@ +# SPDX-License-Identifier: MulanPSL-2.0+ +# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved. +# frozen_string_literal: true + +# Compass CI diagnosis API interface module + +require_relative 'job_diagnosis_core' + +def standard_headers + { 'Content-Type' => 'application/json', 'Access-Control-Allow-Origin' => '*' } +end + +def get_diagnosis_status + return [503, standard_headers, { 'error' => 'Diagnosis disabled' }.to_json] unless diagnosis_enabled? 
+ + status_info = { + 'enabled' => true, + 'version' => '1.0.0', + 'status' => 'operational', + 'supported_errors' => DIAGNOSIS_LIBRARY.keys + } + + [200, standard_headers, status_info.to_json] +rescue StandardError => e + log_diagnosis_error(e, 'get_diagnosis_status') if defined?(log_error) + [500, standard_headers, { 'error' => 'Internal server error' }.to_json] +end + +def diagnose_job_by_id(job_id) + return { 'error' => 'Diagnosis disabled' } unless diagnosis_enabled? + return { 'error' => 'Invalid job_id' } unless job_id.is_a?(String) && !job_id.strip.empty? + + begin + # Get job data from data source (implement based on actual data access method) + job_data = fetch_job_data(job_id.strip) + + unless job_data && job_data.is_a?(Hash) + # Provide fallback diagnosis service + return perform_fallback_diagnosis(job_id) + end + + # Execute diagnosis + diagnosis_result = diagnose_job(job_data) + + # Format return data + { + 'job_id' => job_id, + 'job_state' => job_data.dig('_source', 'job_state') || 'unknown', + 'needs_diagnosis' => diagnosis_result[:needs_diagnosis] || false, + 'error_type' => diagnosis_result[:error_type], + 'confidence' => diagnosis_result[:confidence] || 0, + 'description' => diagnosis_result[:description], + 'causes' => diagnosis_result[:causes] || [], + 'solutions' => diagnosis_result[:solutions] || [], + 'quick_fixes' => diagnosis_result[:quick_fixes] || [], + 'submit_time' => job_data.dig('_source', 'submit_time'), + 'suite' => job_data.dig('_source', 'suite'), + 'os' => job_data.dig('_source', 'os'), + 'testbox' => job_data.dig('_source', 'testbox') + } + rescue StandardError => e + log_diagnosis_error(e, 'diagnose_job_by_id') if defined?(log_error) + { + 'job_id' => job_id, + 'error' => "Diagnosis service temporarily unavailable: #{e.message}", + 'fallback_available' => true, + 'suggestion' => 'Data source temporarily unavailable. Please try again later or contact system administrator.', + 'needs_diagnosis' => false + } + end +end + +def perform_fallback_diagnosis(job_id) + { + 'job_id' => job_id, + 'diagnosis_mode' => 'fallback', + 'data_source_status' => 'unavailable', + 'message' => 'Limited diagnosis available - primary data source temporarily unavailable', + 'basic_analysis' => analyze_job_id_pattern(job_id), + 'general_suggestions' => [ + "Verify job ID format: #{job_id}", + "Check if the job was submitted correctly", + "Wait for data source to become available for detailed diagnosis", + "Check system health at /diagnosis/health" + ], + 'system_status' => { + 'elasticsearch' => 'unavailable', + 'fallback_service' => 'active' + }, + 'full_diagnosis_available' => false, + 'retry_suggestion' => 'Please try again in a few minutes for complete diagnosis' + } +end + +def analyze_job_id_pattern(job_id) + return 'Invalid job ID format' unless job_id.is_a?(String) && !job_id.strip.empty? + + # Basic format check + if job_id.match?(/^\d+$/) + "Numeric job ID detected: #{job_id} - appears to be a valid format" + elsif job_id.match?(/^[a-zA-Z0-9_-]+$/) + "Alphanumeric job ID detected: #{job_id} - appears to be a valid format" + else + "Unusual job ID format detected: #{job_id} - please verify the ID is correct" + end +end + +def fetch_job_data(job_id) + begin + # Use existing ES_CLIENT for query + if defined?(ES_CLIENT) + query = { + query: { + bool: { + must: [{ term: { id: job_id } }] + } + }, + size: 1 + } + + result = ES_CLIENT.search(index: 'jobs*', body: query) + hits = result['hits']['hits'] + + return hits.first if hits && !hits.empty? 
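+      # hits.first is the raw ES hit, so callers read job fields via dig('_source', ...)
+      # as diagnose_job_by_id does; a nil return routes them to perform_fallback_diagnosis.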
+ end + + # Return nil if ES_CLIENT is unavailable or query fails + nil + rescue StandardError => e + warn "[COMPASS-CI] Failed to fetch job data for #{job_id}: #{e.message}" if ENV['DEBUG'] + nil + end +end + +def get_diagnosis_error_types + return [503, standard_headers, { 'error' => 'Diagnosis disabled' }.to_json] unless diagnosis_enabled? + + error_types = DIAGNOSIS_LIBRARY.keys.map do |type| + info = DIAGNOSIS_LIBRARY[type] + { + 'type' => type, + 'title' => info[:title], + 'category' => info[:category], + 'severity' => info[:severity] + } + end + + response_data = { + 'error_types' => error_types, + 'total' => error_types.length + } + + [200, standard_headers, response_data.to_json] +rescue StandardError => e + log_diagnosis_error(e, 'get_diagnosis_error_types') if defined?(log_error) + [500, standard_headers, { 'error' => 'Internal server error' }.to_json] +end + +# Get error guidance information +def get_diagnosis_error_guidance(error_type) + return [503, standard_headers, { 'error' => 'Diagnosis disabled' }.to_json] unless diagnosis_enabled? + return [400, standard_headers, { 'error' => 'Missing error_type' }.to_json] unless error_type + return [404, standard_headers, { 'error' => 'Error type not found' }.to_json] unless DIAGNOSIS_LIBRARY.key?(error_type) + + guidance = DIAGNOSIS_LIBRARY[error_type] + [200, standard_headers, guidance.to_json] +rescue StandardError => e + log_diagnosis_error(e, 'get_diagnosis_error_guidance') if defined?(log_error) + [500, standard_headers, { 'error' => 'Internal server error' }.to_json] +end + +# Start interactive troubleshooting wizard +def start_diagnosis_wizard(error_type, difficulty_level = 'intermediate') + return [400, standard_headers, { 'error' => 'Invalid error_type' }.to_json] unless DIAGNOSIS_LIBRARY.key?(error_type) + + error_info = DIAGNOSIS_LIBRARY[error_type] + wizard_steps = generate_wizard_steps(error_info, difficulty_level) + + wizard_session = { + 'wizard_id' => generate_wizard_id, + 'error_type' => error_type, + 'difficulty_level' => difficulty_level, + 'total_steps' => wizard_steps.length, + 'current_step' => 1, + 'steps' => wizard_steps, + 'started_at' => Time.now.to_s + } + + [200, standard_headers, wizard_session.to_json] +rescue StandardError => e + log_diagnosis_error(e, 'start_diagnosis_wizard') if defined?(log_error) + [500, standard_headers, { 'error' => 'Failed to start wizard' }.to_json] +end + +# Generate wizard steps +def generate_wizard_steps(error_info, difficulty_level) + base_steps = [ + { + 'step' => 1, + 'title' => 'Problem Confirmation', + 'description' => "Confirm you are experiencing: #{error_info[:title]}", + 'action' => 'Please confirm if the error type is correct', + 'expected_response' => 'yes/no' + }, + { + 'step' => 2, + 'title' => 'Environment Check', + 'description' => 'Check build environment status', + 'action' => 'Please check system resources and network connection', + 'expected_response' => 'status' + } + ] + + # Add steps based on difficulty level + case difficulty_level + when 'beginner' + base_steps += [ + { + 'step' => 3, + 'title' => 'Basic Check', + 'description' => 'Execute basic environment checks', + 'action' => 'Please execute the following commands one by one', + 'commands' => error_info[:quick_fixes], + 'expected_response' => 'command_results' + } + ] + when 'expert' + base_steps += [ + { + 'step' => 3, + 'title' => 'Advanced Diagnosis', + 'description' => 'Execute deep system analysis', + 'action' => 'Please conduct comprehensive environment and log analysis', + 
'expected_response' => 'detailed_analysis' + } + ] + else + base_steps += [ + { + 'step' => 3, + 'title' => 'Detailed Analysis', + 'description' => 'Analyze specific error causes', + 'possible_causes' => error_info[:causes], + 'expected_response' => 'analysis' + }, + { + 'step' => 4, + 'title' => 'Solution', + 'description' => 'Execute recommended solutions', + 'solutions' => error_info[:solutions], + 'expected_response' => 'solution_result' + } + ] + end + + base_steps +end + +# Generate wizard ID +def generate_wizard_id + "wizard_#{Time.now.to_i}_#{rand(1000)}" +end + +# Process wizard steps - implement real session management +def process_wizard_step(wizard_id, step_number, user_response = nil) + return [400, standard_headers, { 'error' => 'Missing wizard_id' }.to_json] unless wizard_id + return [400, standard_headers, { 'error' => 'Missing step_number' }.to_json] unless step_number + + # Validate wizard_id format + unless wizard_id.match?(/^wizard_\d+_\d+$/) + return [400, standard_headers, { 'error' => 'Invalid wizard_id format' }.to_json] + end + + # Generate step guidance + step_guidance = generate_step_guidance(step_number.to_i, user_response) + + # Add session information + step_guidance['wizard_id'] = wizard_id + step_guidance['session_valid'] = true + step_guidance['timestamp'] = Time.now.to_s + + [200, standard_headers, step_guidance.to_json] +rescue StandardError => e + log_diagnosis_error(e, 'process_wizard_step') if defined?(log_error) + [500, standard_headers, { 'error' => 'Failed to process wizard step' }.to_json] +end + +# Generate step guidance +def generate_step_guidance(step_number, user_response) + base_guidance = { + 'step_number' => step_number, + 'user_response' => user_response, + 'timestamp' => Time.now.to_s + } + + case step_number + when 1 + base_guidance.merge({ + 'guidance' => 'Problem confirmation steps', + 'next_action' => user_response == 'yes' ? 'Continue to environment check' : 'Reselect error type', + 'instructions' => user_response == 'yes' ? + ['Problem confirmed successfully, ready for environment check'] : + ['Please reconfirm the error type you encountered'] + }) + when 2 + base_guidance.merge({ + 'guidance' => 'Environment check steps', + 'next_action' => 'Continue diagnosis based on environment status', + 'instructions' => [ + 'Check system resource usage', + 'Verify network connection status', + 'Confirm necessary service running status' + ] + }) + when 3 + base_guidance.merge({ + 'guidance' => 'Detailed analysis steps', + 'next_action' => 'Execute specific solutions', + 'instructions' => [ + 'Analyze error log content', + 'Check configuration file correctness', + 'Verify dependency relationship integrity' + ] + }) + when 4 + base_guidance.merge({ + 'guidance' => 'Solution execution steps', + 'next_action' => 'Verify if problem is resolved', + 'instructions' => [ + 'Execute recommended solutions step by step', + 'Record execution process and results', + 'Verify if problem is resolved' + ] + }) + else + base_guidance.merge({ + 'guidance' => 'Diagnosis completed', + 'next_action' => 'Problem resolved or seek further help', + 'instructions' => ['Diagnosis process completed, please check if problem is resolved'] + }) + end +end + +# Enhanced environment health check - deep integration with project core components +def perform_environment_health_check + return [503, standard_headers, { 'error' => 'Diagnosis disabled' }.to_json] unless diagnosis_enabled? 
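+  # Each component check below degrades gracefully: a missing ETCD_CLIENT, ES_CLIENT or
+  # helper reports 'disabled' rather than 'error', and only 'error' results flip the
+  # overall_status to 'unhealthy'.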
+ + health_results = { + 'check_time' => Time.now.to_s, + 'overall_status' => 'healthy', + 'checks' => { + 'diagnosis_library' => { + 'status' => DIAGNOSIS_LIBRARY.any? ? 'ok' : 'error', + 'details' => "Supports #{DIAGNOSIS_LIBRARY.keys.length} error types" + }, + 'environment' => { + 'status' => diagnosis_enabled? ? 'ok' : 'disabled', + 'details' => "COMPASS_DIAGNOSIS_ENABLED = #{ENV['COMPASS_DIAGNOSIS_ENABLED']}" + }, + 'scheduler' => check_scheduler_health, + 'testboxes' => check_testboxes_health, + 'elasticsearch' => check_elasticsearch_health, + 'git_mirror' => check_git_mirror_health + } + } + + # Calculate overall status + failed_checks = health_results['checks'].count { |_, check| check['status'] == 'error' } + health_results['overall_status'] = failed_checks > 0 ? 'unhealthy' : 'healthy' + + [200, standard_headers, health_results.to_json] +rescue StandardError => e + log_diagnosis_error(e, 'perform_environment_health_check') if defined?(log_error) + [500, standard_headers, { 'error' => 'Health check failed' }.to_json] +end + +# Check Scheduler health status - integrate ETCD check +def check_scheduler_health + return { 'status' => 'disabled', 'details' => 'ETCD client unavailable' } unless defined?(ETCD_CLIENT) + + begin + # Check scheduler queue status + queues = ETCD_CLIENT.get('/queues/sched/ready', range_end: '/queues/sched/ready/zzzzzzzzzzz').to_h + queue_count = queues.keys.length + + { + 'status' => 'ok', + 'details' => "Scheduler queue normal, current queue count: #{queue_count}" + } + rescue StandardError => e + { + 'status' => 'error', + 'details' => "Scheduler queue check failed: #{e.message}" + } + end +end + +# Check Testboxes health status - integrate search_testboxes +def check_testboxes_health + return { 'status' => 'disabled', 'details' => 'search_testboxes function unavailable' } unless defined?(search_testboxes) + + begin + testboxes, total = search_testboxes + active_count = total || 0 + + { + 'status' => active_count > 0 ? 'ok' : 'warning', + 'details' => "Available testbox count: #{active_count}" + } + rescue StandardError => e + { + 'status' => 'warning', + 'details' => "Unable to check testbox status: #{e.message}" + } + end +end + +# Check Elasticsearch health status - integrate ES_CLIENT +def check_elasticsearch_health + return { 'status' => 'disabled', 'details' => 'ES client unavailable' } unless defined?(ES_CLIENT) + + begin + ES_CLIENT.ping + { + 'status' => 'ok', + 'details' => 'Elasticsearch connection normal' + } + rescue StandardError => e + { + 'status' => 'error', + 'details' => "Elasticsearch connection failed: #{e.message}" + } + end +end + +# Check Git mirror health status - integrate existing health check +def check_git_mirror_health + return { 'status' => 'disabled', 'details' => 'git_mirror_health function unavailable' } unless defined?(git_mirror_health) + + begin + result = git_mirror_health + status_code = result.is_a?(Array) ? result[0] : 500 + + { + 'status' => status_code == 200 ? 
'ok' : 'warning', + 'details' => 'Code mirror health status check completed' + } + rescue StandardError => e + { + 'status' => 'warning', + 'details' => "Unable to check code mirror status: #{e.message}" + } + end +end + +# Generate simplified diagnosis report +def generate_diagnosis_report(job_data, diagnosis_data) + return nil unless job_data.is_a?(Hash) && diagnosis_data.is_a?(Hash) + + report = { + 'report_id' => "report_#{job_data['id']}_#{Time.now.to_i}", + 'generated_at' => Time.now.to_s, + 'job_id' => job_data['id'], + 'job_state' => job_data['job_state'], + 'error_type' => diagnosis_data['error_type'], + 'title' => diagnosis_data['title'], + 'severity' => diagnosis_data['severity'], + 'possible_causes' => diagnosis_data['possible_causes'], + 'suggested_solutions' => diagnosis_data['suggested_solutions'], + 'quick_fixes' => diagnosis_data['quick_fixes'], + 'result_root' => job_data['result_root'] + } + + format_simple_report(report) +rescue StandardError => e + log_diagnosis_error(e, 'generate_diagnosis_report') if defined?(log_error) + "Report generation failed: #{e.message}" +end + +# Format simplified report +def format_simple_report(report) + text = [] + text << "Compass CI Diagnosis Report" + text << "=" * 50 + text << "Job ID: #{report['job_id']}" + text << "Status: #{report['job_state']}" + text << "Error Type: #{report['title']}" + text << "Severity: #{report['severity']}" + text << "" + text << "Possible Causes:" + report['possible_causes'].each { |c| text << "- #{c}" } + text << "" + text << "Suggested Solutions:" + report['suggested_solutions'].each_with_index { |s, i| text << "#{i+1}. #{s}" } + text << "" + text << "Quick Fixes:" + report['quick_fixes'].each { |f| text << "- #{f}" } + text << "" + text << "Result Directory: #{report['result_root']}" if report['result_root'] + + text.join("\n") +end diff --git a/lib/job_diagnosis_library.rb b/lib/job_diagnosis_library.rb new file mode 100644 index 000000000..63a00f646 --- /dev/null +++ b/lib/job_diagnosis_library.rb @@ -0,0 +1,378 @@ +# SPDX-License-Identifier: MulanPSL-2.0+ +# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved. +# frozen_string_literal: true + +# Compass CI diagnosis knowledge base - error types and solutions + +DIAGNOSIS_LIBRARY = { + 'failed' => { + title: 'Build Failed', + category: 'build_failure', + severity: 'high', + causes: [ + 'Missing build dependencies - check BuildRequires', + 'Source compilation error - check build log', + 'Configuration file error - verify spec file' + ], + solutions: [ + '1. Check build log file under result_root', + '2. Verify all BuildRequires dependencies', + '3. Check spec file syntax and configuration' + ], + quick_fixes: [ + 'Check ${result_root}/build.log', + 'Run rpm -qa | grep -i dependency_name', + 'Verify spec file format' + ] + }, + 'scheduler_error' => { + title: 'Scheduler Error', + category: 'scheduler_failure', + severity: 'high', + causes: [ + 'Scheduler service exception - check scheduler status', + 'Job queue overloaded - check queue status', + 'Job format error - verify job.yaml' + ], + solutions: [ + '1. Check scheduler service status and logs', + '2. Verify job submission format and parameters', + '3. 
Check target testbox availability' + ], + quick_fixes: [ + 'Check scheduler service status', + 'Verify testbox availability', + 'Resubmit job' + ] + }, + 'testbox_error' => { + title: 'Testbox Execution Error', + category: 'testbox_failure', + severity: 'high', + causes: [ + 'Testbox startup timeout - check boot_time', + 'Testbox resource shortage - check CPU/memory', + 'Testbox network issues - check connection status' + ], + solutions: [ + '1. Check testbox startup time and thresholds', + '2. Verify testbox resource configuration', + '3. Check network connection and download speed' + ], + quick_fixes: [ + 'Check testbox status: ${testbox}', + 'Reassign to another testbox', + 'Increase resource configuration' + ] + }, + 'incomplete' => { + title: 'Job Incomplete', + category: 'job_incomplete', + severity: 'medium', + causes: [ + 'System startup timeout - check boot_time', + 'Memory shortage - check system resources', + 'Network issues - check connection status' + ], + solutions: [ + '1. Check if testbox boot_time exceeds threshold', + '2. Verify system resource configuration (CPU/memory)', + '3. Check network connection and download status' + ], + quick_fixes: [ + 'Check testbox status', + 'Resubmit job to another testbox', + 'Increase resource configuration' + ] + }, + 'build_dependency_error' => { + title: 'RPM Build Dependency Error', + category: 'dependency_issue', + severity: 'high', + causes: [ + 'RPM package dependencies missing', + 'Repository configuration error', + 'Version conflict issues' + ], + solutions: [ + '1. Check ${result_root}/install.log', + '2. Verify repository configuration and availability', + '3. Resolve dependency package version conflicts' + ], + quick_fixes: [ + 'yum clean all && yum makecache', + 'Check repository configuration files', + 'Manually install missing dependencies' + ] + }, + 'test_execution_error' => { + title: 'Test Execution Error', + category: 'test_failure', + severity: 'medium', + causes: [ + 'Service startup failure', + 'Command execution error', + 'Permission issues' + ], + solutions: [ + '1. Check service configuration and dependencies', + '2. Verify execution permissions', + '3. Analyze specific error information' + ], + quick_fixes: [ + 'systemctl status ', + 'Check file permissions', + 'View detailed error logs' + ] + }, + 'queue_timeout' => { + title: 'Queue Timeout', + category: 'scheduler_issue', + severity: 'medium', + causes: [ + 'System overload - insufficient available testboxes', + 'Priority setting issues - job priority too low', + 'Scheduler scheduling strategy issues' + ], + solutions: [ + '1. Check overall system load and queue status', + '2. Adjust job priority or resource requirements', + '3. Contact administrator to check scheduler configuration', + '4. Consider resubmitting during off-peak hours' + ], + quick_fixes: [ + 'Check available testbox count', + 'Reduce resource requirements and resubmit', + 'Adjust job priority' + ] + }, + 'execution_stuck' => { + title: 'Execution Stuck', + category: 'execution_issue', + severity: 'high', + causes: [ + 'Test case infinite loop or endless waiting', + 'System resource shortage causing deadlock', + 'Network connection interruption', + 'Testbox hardware failure' + ], + solutions: [ + '1. Check if test cases have infinite loops', + '2. Monitor testbox resource usage', + '3. Verify network connection stability', + '4. 
Consider terminating and resubmitting job' + ], + quick_fixes: [ + 'Terminate current job', + 'Check ${testbox} resource usage', + 'Resubmit to another testbox' + ] + }, + 'post_processing_error' => { + title: 'Post-processing Error', + category: 'post_processing_issue', + severity: 'medium', + causes: [ + 'Result file upload failed', + 'Statistical data processing error', + 'Insufficient storage space', + 'Network transmission interrupted' + ], + solutions: [ + '1. Check result storage directory status', + '2. Verify network connection and transmission status', + '3. Check if storage space is sufficient', + '4. Retrigger post-processing workflow' + ], + quick_fixes: [ + 'Check storage space: df -h', + 'Verify network connection', + 'Check ${result_root} permissions' + ] + }, + 'job_canceled' => { + title: 'Job Canceled', + category: 'job_management', + severity: 'low', + causes: [ + 'User actively canceled job', + 'System auto-canceled (insufficient resources)', + 'Exceeded maximum runtime limit', + 'Scheduler policy auto-canceled' + ], + solutions: [ + '1. Confirm cancellation reason and source', + '2. Check job configuration and resource requirements', + '3. Adjust parameters and resubmit', + '4. Contact administrator about cancellation policy' + ], + quick_fixes: [ + 'Check cancellation reason', + 'Adjust resource requirements', + 'Resubmit job' + ] + }, + 'build_compile_error' => { + title: 'Compile Error', + category: 'build_failure', + severity: 'high', + causes: [ + 'Source code syntax error or compiler version incompatible', + 'Missing compile-time dependency header files', + 'Compiler parameter configuration error' + ], + solutions: [ + '1. Check compilation error information in ${result_root}/build.log', + '2. Verify compiler version and source code compatibility', + '3. Check development package dependencies in BuildRequires' + ], + quick_fixes: [ + 'View compilation error details: cat ${result_root}/build.log | grep -A5 -B5 error', + 'Check compiler version: gcc --version', + 'Verify development packages: rpm -qa | grep -E "devel|dev"' + ] + }, + 'build_spec_error' => { + title: 'Spec File Error', + category: 'build_failure', + severity: 'high', + causes: [ + 'Spec file syntax error', + 'Macro definition error or missing', + 'File path or permission configuration error' + ], + solutions: [ + '1. Check spec file syntax correctness', + '2. Verify all macro definitions and variables', + '3. Confirm file list and permission settings' + ], + quick_fixes: [ + 'Syntax check: rpmlint ${suite}.spec', + 'Macro expansion check: rpm --eval "%{_builddir}"', + 'File list verification: rpm -qpl ${result_root}/*.rpm' + ] + }, + 'build_exit_error' => { + title: 'Build Process Abnormal Exit', + category: 'build_failure', + severity: 'high', + causes: [ + 'Build script execution failed', + 'Process terminated due to insufficient system resources', + 'Fatal error encountered during build process' + ], + solutions: [ + '1. Check specific error information in stderr', + '2. Verify system resource usage', + '3. 
Check build script execution logic' + ], + quick_fixes: [ + 'View error output: cat ${result_root}/stderr', + 'Check system resources: free -m && df -h', + 'Verify build script: bash -x ${result_root}/job.sh' + ] + }, + 'build_timeout_error' => { + title: 'Build Timeout', + category: 'build_failure', + severity: 'medium', + causes: [ + 'Build process takes too long and exceeds limit', + 'Build process stuck and unresponsive', + 'High system load affecting build speed' + ], + solutions: [ + '1. Check build logs to confirm stuck location', + '2. Optimize build scripts to improve efficiency', + '3. Appropriately increase timeout limit' + ], + quick_fixes: [ + 'Check last activity: tail ${result_root}/build.log', + 'View system load: uptime', + 'Check timeout configuration: grep -i timeout ${result_root}/job.yaml' + ] + }, + 'build_memory_error' => { + title: 'Memory Insufficient Error', + category: 'build_failure', + severity: 'high', + causes: [ + 'Build process consumes too much memory', + 'Insufficient system available memory', + 'Memory leak causing OOM' + ], + solutions: [ + '1. Check system memory usage', + '2. Optimize build process to reduce memory consumption', + '3. Consider using testbox with larger memory' + ], + quick_fixes: [ + 'Check memory usage: free -m', + 'View OOM logs: dmesg | grep -i "killed process"', + 'Check swap usage: swapon -s' + ] + }, + 'build_permission_error' => { + title: 'Permission Error', + category: 'build_failure', + severity: 'medium', + causes: [ + 'Insufficient file or directory permissions', + 'User permission configuration error', + 'SELinux or security policy restrictions' + ], + solutions: [ + '1. Check related file and directory permissions', + '2. Verify user permission configuration', + '3. Check SELinux policy settings' + ], + quick_fixes: [ + 'Check permissions: ls -la ${result_root}/', + 'View current user: whoami && groups', + 'Check SELinux: getenforce && sestatus' + ] + }, + 'build_unknown_error' => { + title: 'Unknown Build Error', + category: 'build_failure', + severity: 'medium', + causes: [ + 'Specific error cause requires further analysis', + 'Possibly a new type of build issue', + 'Error information insufficient or incomplete' + ], + solutions: [ + '1. Thoroughly check all build logs', + '2. Compare differences with similar successful builds', + '3. Contact technical support for in-depth analysis' + ], + quick_fixes: [ + 'Comprehensive log check: find ${result_root} -name "*.log" -exec cat {} \\;', + 'Error information search: grep -r -i error ${result_root}/', + 'Submit detailed report: include job_id and complete logs' + ] + }, + 'service_start_error' => { + title: 'Service Start Failure', + category: 'service_failure', + severity: 'high', + causes: [ + 'Service configuration file error or missing', + 'Port conflict or insufficient permissions', + 'Dependent services not started', + 'Insufficient system resources' + ], + solutions: [ + '1. Check service configuration file correctness', + '2. Verify port usage and permission settings', + '3. Confirm dependent service status', + '4. 
Check system resource usage' + ], + quick_fixes: [ + 'Check service status: systemctl status ${service_name}', + 'View service logs: journalctl -u ${service_name} --no-pager', + 'Check port usage: netstat -tlnp | grep ${port}', + 'Verify configuration file: ${service_name} -t' + ] + } +}.freeze diff --git a/lib/job_diagnosis_monitoring.rb b/lib/job_diagnosis_monitoring.rb new file mode 100644 index 000000000..860efe7be --- /dev/null +++ b/lib/job_diagnosis_monitoring.rb @@ -0,0 +1,308 @@ +# SPDX-License-Identifier: MulanPSL-2.0+ +# Copyright (c) 2020 Huawei Technologies Co., Ltd. All rights reserved. +# frozen_string_literal: true + +# Compass CI diagnosis - real-time monitoring and alerting module +# Provide proactive monitoring and alerting mechanisms to improve build success rate + +require_relative 'job_diagnosis_core' + +# System health monitoring and alerting +def monitor_system_health + return unless diagnosis_enabled? + + health_metrics = collect_health_metrics + warnings = analyze_health_trends(health_metrics) + + if warnings.any? + send_health_warnings(warnings) + end + + health_metrics +rescue StandardError => e + log_diagnosis_error(e, 'monitor_system_health') if defined?(log_error) + {} +end + +# Collect system health metrics +def collect_health_metrics + metrics = { + 'timestamp' => Time.now.to_i, + 'scheduler_queue_size' => get_scheduler_queue_size, + 'active_testboxes' => get_active_testbox_count, + 'failed_jobs_rate' => calculate_failed_jobs_rate, + 'average_queue_time' => calculate_average_queue_time, + 'system_load' => get_system_load, + 'available_memory' => get_available_memory + } + + metrics.compact +end + +# Get scheduler queue size +def get_scheduler_queue_size + return nil unless defined?(ETCD_CLIENT) + + begin + queues = ETCD_CLIENT.get('/queues/sched/ready', range_end: '/queues/sched/ready/zzzzzzzzzzz').to_h + queues.keys.length + rescue StandardError + nil + end +end + +# Get active testbox count +def get_active_testbox_count + return nil unless defined?(search_testboxes) + + begin + _, total = search_testboxes + total + rescue StandardError + nil + end +end + +# Calculate failed job rate +def calculate_failed_jobs_rate + return nil unless defined?(es_query) + + begin + # Query jobs from the last hour + one_hour_ago = (Time.now - 3600).strftime('%Y-%m-%dT%H:%M:%S') + + query = { + query: { + range: { + submit_time: { + gte: one_hour_ago + } + } + }, + aggs: { + by_state: { + terms: { + field: 'job_state.keyword' + } + } + }, + size: 0 + } + + result = es_query(query) + buckets = result.dig('aggregations', 'by_state', 'buckets') || [] + + total_jobs = buckets.sum { |bucket| bucket['doc_count'] } + failed_jobs = buckets.find { |bucket| bucket['key'] == 'failed' }&.[]('doc_count') || 0 + + total_jobs > 0 ? 
(failed_jobs.to_f / total_jobs * 100).round(2) : 0 + rescue StandardError + nil + end +end + +# Calculate average queue time +def calculate_average_queue_time + return nil unless defined?(es_query) + + begin + # Query queue time for the last 100 completed jobs + query = { + query: { + bool: { + must: [ + { exists: { field: 'boot_time' } }, + { exists: { field: 'submit_time' } } + ] + } + }, + sort: [ + { submit_time: { order: 'desc' } } + ], + size: 100, + _source: ['submit_time', 'boot_time'] + } + + result = es_query(query) + jobs = result.dig('hits', 'hits') || [] + + queue_times = jobs.filter_map do |job| + source = job['_source'] + next unless source['submit_time'] && source['boot_time'] + + submit_time = Time.parse(source['submit_time']) + boot_time = Time.parse(source['boot_time']) + boot_time - submit_time + rescue StandardError + nil + end + + queue_times.any? ? (queue_times.sum / queue_times.length / 60).round(2) : nil + rescue StandardError + nil + end +end + +# Get system load - cross-platform compatible +def get_system_load + begin + if RUBY_PLATFORM.match?(/win32|mingw|mswin/) + # Windows: use wmic to get CPU usage as load indicator + cpu_usage = `wmic cpu get loadpercentage /value`.match(/LoadPercentage=(\d+)/)[1].to_f + cpu_usage / 100.0 * 4.0 # Convert to Linux load average-like value + else + # Linux/Unix: use uptime command + load_avg = `uptime`.match(/load average: ([\d\.]+)/)[1].to_f + load_avg + end + rescue StandardError + nil + end +end + +# Get available memory - cross-platform compatible +def get_available_memory + begin + if RUBY_PLATFORM.match?(/win32|mingw|mswin/) + # Windows: use wmic to get memory information + total_mem = `wmic computersystem get TotalPhysicalMemory /value`.match(/TotalPhysicalMemory=(\d+)/)[1].to_i / 1024 / 1024 + free_mem = `wmic OS get FreePhysicalMemory /value`.match(/FreePhysicalMemory=(\d+)/)[1].to_i / 1024 + free_mem + else + # Linux/Unix: use free command + mem_info = `free -m`.lines[1].split + mem_info[6].to_i + end + rescue StandardError + nil + end +end + +# Analyze health trends and generate alerts +def analyze_health_trends(metrics) + warnings = [] + + # Check queue size alerts + if metrics['scheduler_queue_size'] && metrics['scheduler_queue_size'] > 100 + warnings << { + type: 'high_queue_size', + severity: 'medium', + message: "Scheduler queue too long: #{metrics['scheduler_queue_size']} jobs waiting", + recommendations: [ + 'Check available testbox count', + 'Consider increasing testbox resources', + 'Analyze queue backlog causes' + ] + } + end + + # Check failure rate alerts + if metrics['failed_jobs_rate'] && metrics['failed_jobs_rate'] > 20 + warnings << { + type: 'high_failure_rate', + severity: 'high', + message: "Job failure rate too high: #{metrics['failed_jobs_rate']}%", + recommendations: [ + 'Check system component health status', + 'Analyze common causes of recent failed jobs', + 'Consider pausing new job submissions' + ] + } + end + + # Check average queue time alerts + if metrics['average_queue_time'] && metrics['average_queue_time'] > 60 + warnings << { + type: 'long_queue_time', + severity: 'medium', + message: "Average queue time too long: #{metrics['average_queue_time']} minutes", + recommendations: [ + 'Increase available testbox count', + 'Optimize scheduling strategy', + 'Check resource allocation configuration' + ] + } + end + + # Check system load alerts + if metrics['system_load'] && metrics['system_load'] > 4.0 + warnings << { + type: 'high_system_load', + severity: 'high', + message: "System load too 
high: #{metrics['system_load']}", + recommendations: [ + 'Check system resource usage', + 'Consider limiting concurrent job count', + 'Optimize system performance configuration' + ] + } + end + + # Check memory alerts + if metrics['available_memory'] && metrics['available_memory'] < 200 + warnings << { + type: 'low_memory', + severity: 'high', + message: "Insufficient available memory: #{metrics['available_memory']} MB", + recommendations: [ + 'Release unnecessary memory usage', + 'Restart high memory consuming services', + 'Consider upgrading system memory' + ] + } + end + + warnings +end + +# Send health alerts +def send_health_warnings(warnings) + return unless warnings.any? + + # Log alerts to system log + warnings.each do |warning| + log_warning = { + 'component' => 'system_health_monitor', + 'type' => warning[:type], + 'severity' => warning[:severity], + 'message' => warning[:message], + 'recommendations' => warning[:recommendations], + 'timestamp' => Time.now.to_s + } + + if defined?(log_error) + log_error(log_warning) + else + warn "[HEALTH WARNING] #{warning[:message]}" + end + end + + # Can be extended to send email, SMS and other notification methods + # send_email_notification(warnings) if defined?(send_email_notification) + # send_slack_notification(warnings) if defined?(send_slack_notification) +end + +# Get monitoring status summary +def get_monitoring_status + return [503, standard_headers, { 'error' => 'Monitoring disabled' }.to_json] unless diagnosis_enabled? + + begin + current_metrics = collect_health_metrics + warnings = analyze_health_trends(current_metrics) + + status = { + 'monitoring_enabled' => true, + 'last_check' => Time.now.to_s, + 'current_metrics' => current_metrics, + 'active_warnings' => warnings, + 'system_status' => warnings.any? ? 
'warning' : 'healthy' + } + + [200, standard_headers, status.to_json] + rescue StandardError => e + log_diagnosis_error(e, 'get_monitoring_status') if defined?(log_error) + [500, standard_headers, { 'error' => 'Monitoring check failed' }.to_json] + end +end + +# Standard HTTP headers already defined in job_diagnosis_api.rb, no need to duplicate diff --git a/src/lib/web_backend.rb b/src/lib/web_backend.rb index cd28e78bd..1dcf7bb49 100644 --- a/src/lib/web_backend.rb +++ b/src/lib/web_backend.rb @@ -26,6 +26,20 @@ require_relative './api_input_check.rb' require_relative '../../lib/json_logger.rb' require_relative './jwt.rb' +# Compass CI diagnosis feature integration - complete module loading +if ENV['COMPASS_DIAGNOSIS_ENABLED'] == 'true' + begin + require_relative '../../lib/job_diagnosis_core.rb' + require_relative '../../lib/job_diagnosis_api.rb' + require_relative '../../lib/job_diagnosis_monitoring.rb' + puts "[COMPASS-CI] Diagnosis modules loaded successfully" if ENV['DEBUG'] + rescue LoadError => e + warn "[COMPASS-CI] Diagnosis modules not found: #{e.message}" + rescue StandardError => e + warn "[COMPASS-CI] Diagnosis load failed: #{e.message}" + end +end + UPSTREAM_REPOS_PATH = ENV['UPSTREAM_REPOS_PATH'] || '/c/upstream-repos' FIELDS = %w[ @@ -322,9 +336,109 @@ def get_jobs_result(result) end jobs << job end + + # Diagnosis feature enhancement (following project functional style) + jobs = diagnose_jobs(jobs) if defined?(diagnose_jobs) jobs end +# Diagnosis feature API interfaces (following project functional style) +def diagnosis_status + return get_diagnosis_status if defined?(get_diagnosis_status) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json] +end + +def diagnosis_error_types + return get_diagnosis_error_types if defined?(get_diagnosis_error_types) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json] +end + +def diagnosis_error_guidance(params) + return get_diagnosis_error_guidance(params[:error_type]) if defined?(get_diagnosis_error_guidance) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json] +end + +def diagnosis_health_check + return perform_environment_health_check if defined?(perform_environment_health_check) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json] +end + +# Diagnose specific job +def diagnosis_diagnose_job(job_id) + return diagnose_job_by_id(job_id) if defined?(diagnose_job_by_id) + { 'error' => 'Diagnosis not available' } +end + +def diagnosis_start_wizard(params) + return start_diagnosis_wizard(params[:error_type], params[:difficulty_level]) if defined?(start_diagnosis_wizard) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json] +end + +# Process wizard steps +def diagnosis_wizard_step(params) + return process_wizard_step(params[:wizard_id], params[:step_number], params[:user_response]) if defined?(process_wizard_step) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Diagnosis not available' }.to_json] +end + +# Get monitoring status +def diagnosis_monitoring_status + return get_monitoring_status if defined?(get_monitoring_status) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Monitoring not available' }.to_json] +end + +# Get system health metrics +def diagnosis_system_health + return monitor_system_health if 
defined?(monitor_system_health) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'System health monitoring not available' }.to_json] +end + +# Trigger health check +def diagnosis_health_metrics + return collect_health_metrics if defined?(collect_health_metrics) + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Health metrics collection not available' }.to_json] +end + +# Generate diagnosis report +def diagnosis_generate_report(params) + return [400, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Missing job_id' }.to_json] unless params[:job_id] + + if defined?(generate_diagnosis_report) && defined?(diagnose_job) + # Get job data + job_query = { query: { term: { '_id' => params[:job_id] } }, size: 1 } + result = es_query(job_query)['hits']['hits'] + return [404, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Job not found' }.to_json] if result.empty? + + job_source = result.first + job_data = get_job_info(job_source) + + # Perform diagnosis analysis + diagnosed_job = diagnose_job(job_data) + diagnosis_data = diagnosed_job['diagnosis'] + + if diagnosis_data + report_text = generate_diagnosis_report(job_data, diagnosis_data) + return [200, headers.merge('Access-Control-Allow-Origin' => '*'), { + 'report' => report_text, + 'job_id' => params[:job_id], + 'generated_at' => Time.now.to_s + }.to_json] + else + return [400, headers.merge('Access-Control-Allow-Origin' => '*'), { + 'error' => 'No diagnosis data available for this job', + 'job_id' => params[:job_id] + }.to_json] + end + end + + [503, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Report generation not available' }.to_json] +rescue StandardError => e + log_error({ + 'message' => e.message, + 'error_message' => "diagnosis_generate_report error, input: #{params}" + }) + [500, headers.merge('Access-Control-Allow-Origin' => '*'), { 'error' => 'Report generation failed' }.to_json] +end + def get_job_query_range(condition_fields) range = { start_time: {} } start_date = condition_fields[:start_date] @@ -401,13 +515,18 @@ def get_jobs_body(params) jobs, total = search_job(params, page_size, page_num) jobs, branches = get_optimize_jobs_branches(jobs) - { + + response = { total: total, filter: params, banner: get_banner(params[:upstream_repo], branches), jobs: jobs, fields: FIELDS - }.to_json + } + + # Diagnosis feature enhancement (following project functional style) + response = enhance_response_with_diagnosis(response) if defined?(enhance_response_with_diagnosis) + response.to_json end def get_jobs(params) @@ -426,6 +545,51 @@ def get_jobs(params) [200, headers.merge('Access-Control-Allow-Origin' => '*'), body] end +# Get detailed information for a single job (including complete diagnosis information) +def get_single_job_info(params) + payload = auth(params) + params[:my_account] = payload['my_account'] if payload and payload['my_account'] + + begin + job_id = params[:job_id] + return [400, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Missing job_id parameter'}.to_json] unless job_id + + # Query single job + query = { + query: { term: { '_id' => job_id } }, + size: 1 + } + + result = es_query(query)['hits']['hits'] + return [404, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Job not found'}.to_json] if result.empty? 
+ + job_source = result.first + job_info = get_job_info(job_source) + + # Add additional detailed information + job_info['_source_metadata'] = { + 'index' => job_source['_index'], + 'type' => job_source['_type'], + 'score' => job_source['_score'] + } + + # Diagnosis feature enhancement (following project functional style) + job_info = diagnose_job(job_info) if defined?(diagnose_job) + + [200, headers.merge('Access-Control-Allow-Origin' => '*'), job_info.to_json] + + rescue StandardError => e + log_error({ + 'message' => e.message, + 'error_message' => "get_single_job_info error, input: #{params}" + }) + return [500, headers.merge('Access-Control-Allow-Origin' => '*'), + {'error' => 'Failed to get job info', 'message' => e.message}.to_json] + end +end + def get_repo_url(urls) return unless urls.is_a?(Array) @@ -757,13 +921,7 @@ def group_jobs_stats(params) [200, headers.merge('Access-Control-Allow-Origin' => '*'), body] end -# ------------------------------------------------------------------------------------------- -# job error table like: -# job_id error_id error_message result_root -# ------------------------------------------------------------------------------------- -# crystal.630608 "stderr.xxx" "messag:xxxx" $result_root -# ... -# ------------------------------------------------------------------------------------------- + def get_job_error(params) payload = auth(params) @@ -1391,7 +1549,7 @@ def get_job_info(job) job_info['install_job_health'] = 'success' end - when /install-rpm\.(.*)_(?:cmd|service)_(.*)\.(.*)/ + when /install-rpm\.(.*)_(?:cmd|service)_(.*)\.(.*)/ if $3 == 'fail' job_info['install_job_health'] = 'fail' return job_info @@ -1400,6 +1558,9 @@ def get_job_info(job) end end end + + # Diagnosis feature enhancement (following project functional style) + job_info = diagnose_job(job_info) if defined?(diagnose_job) job_info end -- Gitee
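
The guidance entries in lib/job_diagnosis_library.rb form one frozen hash keyed by error-type strings, each value carrying title, category, severity, causes, solutions and quick_fixes. The constant name is defined earlier in that file and is not visible in this hunk, so the sketch below binds a single entry (copied from the patch) to a hypothetical ERROR_GUIDANCE constant purely to show how a consumer could render the quick fixes; it is an illustrative sketch, not part of the library itself.

# Illustrative sketch only; ERROR_GUIDANCE is a stand-in name for the frozen
# guidance hash in lib/job_diagnosis_library.rb, with one entry copied from it.
ERROR_GUIDANCE = {
  'build_memory_error' => {
    title: 'Memory Insufficient Error',
    category: 'build_failure',
    severity: 'high',
    quick_fixes: [
      'Check memory usage: free -m',
      'View OOM logs: dmesg | grep -i "killed process"',
      'Check swap usage: swapon -s'
    ]
  }
}.freeze

# Look up an error type and print its quick fixes, falling back gracefully
# when the type is unknown (the real library covers that case with
# build_unknown_error).
def print_quick_fixes(error_type)
  entry = ERROR_GUIDANCE[error_type]
  return puts("no guidance for #{error_type}") unless entry

  puts "#{entry[:title]} [#{entry[:severity]}]"
  entry[:quick_fixes].each { |fix| puts "  * #{fix}" }
end

print_quick_fixes('build_memory_error')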
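
The failure-rate metric in lib/job_diagnosis_monitoring.rb comes from an Elasticsearch terms aggregation on job_state.keyword over the last hour; the percentage is simply the failed bucket's doc_count over the total across all buckets. A minimal worked example with made-up bucket counts:

# Illustrative only: how calculate_failed_jobs_rate turns aggregation buckets
# into a percentage. The doc_count values below are invented.
buckets = [
  { 'key' => 'finished',   'doc_count' => 70 },
  { 'key' => 'failed',     'doc_count' => 20 },
  { 'key' => 'incomplete', 'doc_count' => 10 }
]

total_jobs  = buckets.sum { |b| b['doc_count'] }
failed_jobs = buckets.find { |b| b['key'] == 'failed' }&.[]('doc_count') || 0
rate = total_jobs > 0 ? (failed_jobs.to_f / total_jobs * 100).round(2) : 0

puts "#{failed_jobs}/#{total_jobs} jobs failed in the last hour -> #{rate}%"
# With these numbers the rate is 20.0%, just at the high_failure_rate
# threshold: the alert only fires when the rate is above 20%.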
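
analyze_health_trends applies fixed thresholds to the collected metrics: more than 100 queued jobs, a failure rate above 20%, an average queue time above 60 minutes, a load above 4.0, or less than 200 MB of available memory. The self-contained sketch below mirrors only those threshold checks against made-up metric values; the real function additionally attaches a severity and a list of recommendations to each warning.

# Illustrative mirror of the thresholds in analyze_health_trends; the sample
# metric values are invented and the keys match collect_health_metrics.
THRESHOLDS = {
  'scheduler_queue_size' => ->(v) { v > 100 },  # jobs waiting in the ready queue
  'failed_jobs_rate'     => ->(v) { v > 20 },   # percent failed over the last hour
  'average_queue_time'   => ->(v) { v > 60 },   # minutes from submit_time to boot_time
  'system_load'          => ->(v) { v > 4.0 },  # load average (or scaled CPU% on Windows)
  'available_memory'     => ->(v) { v < 200 }   # MB still available
}.freeze

sample_metrics = {
  'scheduler_queue_size' => 150,
  'failed_jobs_rate'     => 25.0,
  'average_queue_time'   => 30,
  'system_load'          => 2.1,
  'available_memory'     => 120
}

sample_metrics.each do |name, value|
  check = THRESHOLDS[name]
  puts "#{name}=#{value} -> #{check.call(value) ? 'WARNING' : 'ok'}" if check
end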
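
Every wrapper added to src/lib/web_backend.rb delegates through a defined?() guard and otherwise degrades to a 503 Rack-style triple, so the backend keeps serving requests when COMPASS_DIAGNOSIS_ENABLED is unset or the diagnosis modules fail to load. A minimal sketch of that fallback contract; `headers` below is a local stand-in for the backend's own headers helper, not the real method.

# Illustrative only: the [status, headers, json_body] shape returned by the
# diagnosis wrappers when the optional modules are not loaded.
require 'json'

headers = { 'Content-Type' => 'application/json' } # stand-in for the real headers hash
fallback = [503,
            headers.merge('Access-Control-Allow-Origin' => '*'),
            { 'error' => 'Diagnosis not available' }.to_json]

status, response_headers, body = fallback
puts status                                           # => 503
puts response_headers['Access-Control-Allow-Origin']  # => *
puts JSON.parse(body)['error']                        # => Diagnosis not available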