diff --git a/README.md b/README.md index c38b05e9ecc8746d641ad919be2592da8cbbf72c..442afac5b984ab83557a94207b3fe8937243cd0b 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,10 @@ 如果没有公网IP,均使用内网IP即可,实际部署时可以替换成公网IP ```bash + # 通过环境变量控制需要使能的服务(以下是一个常用默认搭配) + export DEPLOY_SERVER_LIST=sysom_api,sysom_diagnosis,sysom_channel,sysom_monitor_server,sysom_log,sysom_vmcore,sysom_migration,sysom_cluster_health,sysom_alarm,sysom_cmg,sysom_vul + + # 使用部署脚本部署 ./deploy.sh /usr/local/sysom 172.22.3.238 172.22.3.238 ``` @@ -224,12 +228,24 @@ Oct 10 12:58:51 mfeng bash[3217754]: + sed -i 's/^FIRST_INIT_DONE=0/FIRST_INIT_DONE=1/g' /usr/local/sysom/init_scripts/server/init.sh ``` -## 3. 通过 WEB 前端访问 +## 3. 容器化部署 + +### 3.1 基础服务部署 + + ```bash + docker build -t sysom-base -f docker/sysom_base_dockerfile . + docker run -idt --privileged --name sysom-base sysom-base + ``` + + 可以通过修改 docker/sysom_base_dockerfile 使能不同的微服务 + + +## 4. 通过 WEB 前端访问 部署成功之后,可以通过访问部署时指定的公网/私网地址访问 SysOM前端,比如 http://172.22.3.238 - 默认的用户名密码:admin/sysom@123 -- SysOM提供了 Demo 体验网站,可以访问:http://sysom.openanolis.cn/ +- SysOM提供了 Demo 体验网站,可以访问:http://sysom.openanolis.cn/,用户名/密码为:demo/sysom@openanolis123 # 其它问题见FAQ diff --git a/conf/config.yml b/conf/config.yml index c3e65bc4e54b82ff331d34b13127c1afa51d8fed..79d9ebb9be345a65c6dbede4bbd3464e9e333486 100644 --- a/conf/config.yml +++ b/conf/config.yml @@ -18,6 +18,9 @@ sysom_server: port: 6379 username: password: + kafka: + host: localhost + port: 9092 mysql: dialect: mariadb engine: pymysql diff --git a/deps/2_nginx/sysom.conf b/deps/2_nginx/sysom.conf index c2b27cc6a41678ae94b9e34dffbeefff4600dcd2..8f2d30eb22f1f9633eaac636671cbc1c0e6ded9c 100644 --- a/deps/2_nginx/sysom.conf +++ b/deps/2_nginx/sysom.conf @@ -163,6 +163,23 @@ server { proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; } + location /api/v1/alert_pusher/ { + proxy_pass http://127.0.0.1:7018; + proxy_read_timeout 180; + proxy_redirect off; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } + + # 7010 reversed for dingtalk + # 7020 resvered for cluster_health + # 7022 resvered for colocation + location /api/v1/cmg/ { + proxy_pass http://127.0.0.1:7023; + proxy_read_timeout 180; + proxy_redirect off; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } + location /api/ { proxy_pass http://127.0.0.1:7001; proxy_read_timeout 180s; diff --git a/deps/3_prometheus/init.sh b/deps/3_prometheus/init.sh index 9082ee8c00c83242f0ace63c369ebdb80ed17db8..ed8316fa3ce4822d62a8a0375c62411a3f6dbd3a 100644 --- a/deps/3_prometheus/init.sh +++ b/deps/3_prometheus/init.sh @@ -18,6 +18,15 @@ add_auto_discovery() metrics_path: "/api/v1/channel/cec_status/metrics" static_configs: - targets: ["localhost:7003"] + - job_name: "cluster_health" + metrics_path: "/metrics" + static_configs: + - targets: ["localhost:7020"] + - job_name: "colocation" + metrics_path: "/metrics" + static_configs: + - targets: ["localhost:7022"] + EOF popd diff --git a/deps/4_grafana/grafana_api_set.sh b/deps/4_grafana/grafana_api_set.sh index cd9efef77639d9ce085d254c878d8779ebfd3656..c3a930832e20eb28af36b7617811974eea21605f 100755 --- a/deps/4_grafana/grafana_api_set.sh +++ b/deps/4_grafana/grafana_api_set.sh @@ -156,6 +156,24 @@ then exit 1 fi +curl -c cookie -b cookie --location --request POST 'http://127.0.0.1:3000/api/dashboards/db' \ +--header 'Content-Type: application/json' \ +-d @"sysom-appobserver-nginx-dashboard.json" +if [ $? -ne 0 ] +then + echo "grafana configure sysom-appobserver-nginx-dashboard error" + exit 1 +fi + +curl -c cookie -b cookie --location --request POST 'http://127.0.0.1:3000/api/dashboards/db' \ +--header 'Content-Type: application/json' \ +-d @"sysom-appobserver-nginx-events-dashboard.json" +if [ $? -ne 0 ] +then + echo "grafana configure sysom-appobserver-nginx-events-dashboard error" + exit 1 +fi + curl -c cookie -b cookie --location --request POST 'http://127.0.0.1:3000/api/dashboards/db' \ --header 'Content-Type: application/json' \ -d @"sysom-appobserver-process-dashboard.json" diff --git a/deps/4_grafana/sysom-appobserver-mysql-dashboard.json b/deps/4_grafana/sysom-appobserver-mysql-dashboard.json index 629f49138fea28f120de93c8686b6e2733faf9b3..c8d44d729bac793ade74e13d9fc050dba33d9316 100644 --- a/deps/4_grafana/sysom-appobserver-mysql-dashboard.json +++ b/deps/4_grafana/sysom-appobserver-mysql-dashboard.json @@ -26,270 +26,22 @@ "links": [], "liveNow": false, "panels": [ - { - "datasource": "sysom-prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 71, - "options": { - "legend": { - "calcs": [], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"curriops\"}", - "legendFormat": "{{disk}}.iops", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"iopsBaseThresh\"}", - "hide": false, - "legendFormat": "{{disk}}.iopsBaseThresh", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"iopsComThresh\"}", - "hide": false, - "legendFormat": "{{disk}}.iopsComThresh", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"iopsMoveAvg\"}", - "hide": false, - "legendFormat": "{{disk}}.iopsMoveAvg", - "range": true, - "refId": "D" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"iopsThresh\"}", - "hide": false, - "legendFormat": "{{disk}}.iopsThresh", - "range": true, - "refId": "E" - } - ], - "title": "iops异常检测追踪", - "type": "timeseries" - }, - { - "datasource": "sysom-prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 72, - "options": { - "legend": { - "calcs": [], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"currbps\"}", - "legendFormat": "{{disk}}.bps", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"bpsThresh\"}", - "hide": false, - "legendFormat": "{{disk}}.bpsThresh", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"bpsMoveAvg\"}", - "hide": false, - "legendFormat": "{{disk}}.bpsMoveAvg", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"bpsComThresh\"}", - "hide": true, - "legendFormat": "{{disk}}.bpsComThresh", - "range": true, - "refId": "D" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_debugDynThreshTlb{exported_instance=\"$hostIP\",value=\"bpsBaseThresh\"}", - "hide": true, - "legendFormat": "{{disk}}.bpsBaseThresh", - "range": true, - "refId": "E" - } - ], - "title": "iops异常检测追踪", - "type": "timeseries" - }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 8 + "y": 0 }, "id": 69, "panels": [], - "title": "Mysql alarm statics", + "title": "MySQL\u5f02\u5e38\u544a\u8b66\u7c7b\u578b\u7edf\u8ba1", "type": "row" }, { "datasource": "sysom-prometheus", + "description": "mysql\u670d\u52a1\u5728OS\u4e2d\u88ab\u89c2\u6d4b\u5230\u53ef\u80fd\u5b58\u5728\u7684\u5f02\u5e38\u544a\u8b66\u4e8b\u4ef6\u7edf\u8ba1\u4fe1\u606f", "fieldConfig": { "defaults": { "color": { @@ -403,7 +155,7 @@ "h": 5, "w": 24, "x": 0, - "y": 9 + "y": 1 }, "id": 64, "links": [], @@ -427,8 +179,9 @@ "datasource": "sysom-prometheus", "editorMode": "code", "expr": "sysom_obser_mysqld_alarm{containerId=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"Alarm_Process_Mysql_Error_Type\"}", + "hide": false, "interval": "35", - "legendFormat": "Mysql Error Alarm", + "legendFormat": "MySQL Error Alarm", "range": true, "refId": "A" }, @@ -438,7 +191,7 @@ "expr": "sysom_obser_mysqld_alarm{containerId=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"Alarm_Process_Mysql_Slow_Sql_Type\"}", "hide": false, "interval": "35", - "legendFormat": "Mysql Slow_Sql Alarm", + "legendFormat": "MySQL Slow_Sql Alarm", "range": true, "refId": "B" }, @@ -448,7 +201,7 @@ "expr": "sysom_obser_mysqld_alarm{containerId=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"Alarm_Process_Net_Drops_Type\"}", "hide": false, "interval": "35", - "legendFormat": "Mysql Net_Drops Alarm", + "legendFormat": "MySQL Net_Drops Alarm", "range": true, "refId": "C" }, @@ -458,7 +211,7 @@ "expr": "sysom_obser_mysqld_alarm{containerId=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"Alarm_Process_OOM_Type\"}", "hide": false, "interval": "35", - "legendFormat": "Mysql OOM Alarm", + "legendFormat": "MySQL OOM Alarm", "range": true, "refId": "D" }, @@ -468,7 +221,7 @@ "expr": "sysom_obser_mysqld_alarm{containerId=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"Alarm_Process_RT_Type\"}", "hide": false, "interval": "35", - "legendFormat": "Mysql RT Alarm", + "legendFormat": "MySQL RT Alarm", "range": true, "refId": "E" }, @@ -478,7 +231,7 @@ "expr": "sysom_obser_mysqld_alarm{containerId=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"Alarm_Process_Sched_Delay_Type\"}", "hide": false, "interval": "35", - "legendFormat": "Mysql Sched_Delay Alarm", + "legendFormat": "MySQL Sched_Delay Alarm", "range": true, "refId": "F" }, @@ -488,7 +241,7 @@ "expr": "sysom_obser_mysqld_alarm{containerId=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"Alarm_Long_Time_D_Type\"}", "hide": false, "interval": "35", - "legendFormat": "Mysql Long_Time_D Alarm", + "legendFormat": "MySQL Long_Time_D Alarm", "range": true, "refId": "G" }, @@ -582,7 +335,7 @@ "refId": "P" } ], - "title": "异常告警分布(次数)", + "title": "\u5f02\u5e38\u544a\u8b66\u5206\u5e03\uff08\u6b21\u6570\uff09", "type": "bargauge" }, { @@ -591,11 +344,11 @@ "h": 1, "w": 24, "x": 0, - "y": 14 + "y": 6 }, "id": 26, "panels": [], - "title": "Mysql resource usage", + "title": "MySQL\u8d44\u6e90\u4f7f\u7528\u8be6\u60c5", "type": "row" }, { @@ -622,7 +375,7 @@ { "matcher": { "id": "byName", - "options": "缓存线程数" + "options": "\u7f13\u5b58\u7ebf\u7a0b\u6570" }, "properties": [ { @@ -642,7 +395,7 @@ { "matcher": { "id": "byName", - "options": "创建线程数" + "options": "\u521b\u5efa\u7ebf\u7a0b\u6570" }, "properties": [ { @@ -666,7 +419,7 @@ { "matcher": { "id": "byName", - "options": "运行线程数" + "options": "\u8fd0\u884c\u7ebf\u7a0b\u6570" }, "properties": [ { @@ -690,7 +443,7 @@ { "matcher": { "id": "byName", - "options": "连接线程数" + "options": "\u8fde\u63a5\u7ebf\u7a0b\u6570" }, "properties": [ { @@ -717,7 +470,7 @@ "h": 5, "w": 24, "x": 0, - "y": 15 + "y": 7 }, "id": 38, "options": { @@ -741,7 +494,7 @@ "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"threadCached\"}", "hide": false, "interval": "35", - "legendFormat": "缓存线程数", + "legendFormat": "\u7f13\u5b58\u7ebf\u7a0b\u6570", "range": true, "refId": "A" }, @@ -751,7 +504,7 @@ "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"threadCreated\"}", "hide": false, "interval": "35", - "legendFormat": "创建线程总数", + "legendFormat": "\u521b\u5efa\u7ebf\u7a0b\u603b\u6570", "range": true, "refId": "B" }, @@ -761,7 +514,7 @@ "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"threadRunning\"}", "hide": false, "interval": "35", - "legendFormat": "运行线程数", + "legendFormat": "\u8fd0\u884c\u7ebf\u7a0b\u6570", "range": true, "refId": "C" }, @@ -771,7 +524,7 @@ "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"threadConnected\"}", "hide": false, "interval": "35", - "legendFormat": "连接线程数", + "legendFormat": "\u8fde\u63a5\u7ebf\u7a0b\u6570", "range": true, "refId": "E" }, @@ -781,12 +534,12 @@ "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"maxConnection\"}", "hide": false, "interval": "35", - "legendFormat": "最大连接限制", + "legendFormat": "\u6700\u5927\u8fde\u63a5\u9650\u5236", "range": true, "refId": "D" } ], - "title": "mysql 连接线程池使用", + "title": "mySQL \u8fde\u63a5\u7ebf\u7a0b\u6c60\u4f7f\u7528", "type": "gauge" }, { @@ -850,7 +603,7 @@ "h": 8, "w": 8, "x": 0, - "y": 20 + "y": 12 }, "id": 17, "options": { @@ -904,12 +657,12 @@ "refId": "C" } ], - "title": "Mysql CPU占用率", + "title": "MySQL CPU\u5360\u7528\u7387", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "", + "description": "\u4e00\u822c\u5730\u5360\u6bd4\u8d8a\u4f4e\u8d8a\u597d\uff0c\u5360\u6bd4\u8d8a\u9ad8\uff0c\u8bf4\u660e\u7cfb\u7edf\u5206\u914d\u7ed9mysql\u7684CPU\u65f6\u95f4\u7247\u7ecf\u5e38\u5904\u4e8e\u672a\u4f7f\u7528\u5b8c\u5c31\u88ab\u5207\u6362\u51fa\u53bb\u7684\u72b6\u6001", "fieldConfig": { "defaults": { "color": { @@ -968,7 +721,7 @@ "h": 8, "w": 8, "x": 8, - "y": 20 + "y": 12 }, "id": 18, "options": { @@ -996,12 +749,12 @@ "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"cpuGiveup\"}", "hide": false, "interval": "35", - "legendFormat": "让出率", + "legendFormat": "\u8ba9\u51fa\u7387", "range": true, "refId": "A" } ], - "title": "Mysql CPU让出率", + "title": "MySQL CPU\u8ba9\u51fa\u7387", "type": "timeseries" }, { @@ -1061,7 +814,7 @@ "h": 8, "w": 8, "x": 16, - "y": 20 + "y": 12 }, "id": 42, "links": [], @@ -1127,13 +880,13 @@ "hide": false, "interval": "35", "intervalFactor": 1, - "legendFormat": "长事务数", + "legendFormat": "\u957f\u4e8b\u52a1\u6570", "range": true, "refId": "D", "step": 240 } ], - "title": "Mysql Undolog链表长度&长事务", + "title": "MySQL Undolog\u94fe\u8868\u957f\u5ea6&\u957f\u4e8b\u52a1", "type": "timeseries" }, { @@ -1162,7 +915,7 @@ "h": 7, "w": 8, "x": 0, - "y": 28 + "y": 20 }, "id": 34, "options": { @@ -1198,7 +951,7 @@ "editorMode": "code", "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"bufferPoolFree\"}", "interval": "35", - "legendFormat": "空闲", + "legendFormat": "\u7a7a\u95f2", "range": true, "refId": "A" }, @@ -1208,12 +961,12 @@ "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"bufferPoolTotal\"}-on(podID)sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"bufferPoolFree\"}", "hide": false, "interval": "35", - "legendFormat": "已用", + "legendFormat": "\u5df2\u7528", "range": true, "refId": "B" } ], - "title": "Mysql 内存缓存池使用", + "title": "MySQL \u5185\u5b58\u7f13\u5b58\u6c60\u4f7f\u7528", "type": "piechart" }, { @@ -1240,7 +993,7 @@ "h": 7, "w": 8, "x": 8, - "y": 28 + "y": 20 }, "id": 62, "options": { @@ -1302,7 +1055,7 @@ "refId": "C" } ], - "title": "Mysql OS内存使用分布", + "title": "MySQL OS\u5185\u5b58\u4f7f\u7528\u5206\u5e03", "transformations": [], "type": "piechart" }, @@ -1360,7 +1113,7 @@ { "matcher": { "id": "byName", - "options": "空闲" + "options": "\u7a7a\u95f2" }, "properties": [ { @@ -1375,7 +1128,7 @@ { "matcher": { "id": "byName", - "options": "已用" + "options": "\u5df2\u7528" }, "properties": [ { @@ -1393,7 +1146,7 @@ "h": 7, "w": 8, "x": 16, - "y": 28 + "y": 20 }, "id": 36, "options": { @@ -1431,7 +1184,7 @@ "editorMode": "code", "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"chkPointUsage\"}", "interval": "35", - "legendFormat": "已用", + "legendFormat": "\u5df2\u7528", "range": true, "refId": "A" }, @@ -1441,12 +1194,12 @@ "expr": "sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"redologCapacity\"}-on(containerID)sysom_obser_mysqld_innodb{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"chkPointUsage\"}", "hide": false, "interval": "35", - "legendFormat": "空闲", + "legendFormat": "\u7a7a\u95f2", "range": true, "refId": "B" } ], - "title": "Mysql Redolog使用量", + "title": "MySQL Redolog\u4f7f\u7528\u91cf", "type": "piechart" }, { @@ -1455,16 +1208,16 @@ "h": 1, "w": 24, "x": 0, - "y": 35 + "y": 27 }, "id": 54, "panels": [], - "title": "Mysql latency details", + "title": "MySQL RT\u5ef6\u8fdf\u8be6\u60c5", "type": "row" }, { "datasource": "sysom-prometheus", - "description": "", + "description": "mysql\u670d\u52a1\u4fa7\u63a5\u53d7\u5230\u4e00\u4e2a\u8bf7\u6c42\uff0c\u5230\u5904\u7406\u5b8c\u4e4b\u540e\uff0c\u56de\u5e94\u5ba2\u6237\u7aef\u8fd9\u4e00\u8fc7\u7a0b\u7684\u603b\u65f6\u95f4\u6d88\u8017\u3002\u8fd9\u91cc\u5c55\u793a\u7684\u662f\u5e73\u5747RT\u503c", "fieldConfig": { "defaults": { "color": { @@ -1509,11 +1262,12 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, - "unit": "µs" + "unit": "\u00b5s" }, "overrides": [] }, @@ -1521,7 +1275,7 @@ "h": 9, "w": 8, "x": 0, - "y": 36 + "y": 28 }, "id": 46, "options": { @@ -1575,11 +1329,12 @@ "refId": "C" } ], - "title": "Mysql RT", + "title": "MySQL RT", "type": "timeseries" }, { "datasource": "sysom-prometheus", + "description": "\u5bf9\u5e94\u4e0d\u540c\u8fde\u63a5\u7684\u8bf7\u6c42\u8be6\u60c5\uff0c\u53f3\u4fa7\u7684\u201cRT\u5206\u6790\u201d\u53ef\u4ee5\u8df3\u8f6c\u5230RT\u5ef6\u8fdf\u8bca\u65ad\uff0c\u8fdb\u4e00\u6b65\u5c06RT\u5ef6\u8fdf\u7ec6\u7c92\u5ea6\u5c55\u5f00", "fieldConfig": { "defaults": { "color": { @@ -1595,7 +1350,8 @@ "mode": "absolute", "steps": [ { - "color": "semi-dark-red" + "color": "semi-dark-red", + "value": null } ] } @@ -1740,7 +1496,7 @@ "value": [ { "targetBlank": true, - "title": "RT根因分析", + "title": "RT\u6839\u56e0\u5206\u6790", "url": "../diagnose/link/rtdelay?instance=${__data.fields.exported_instance}&pid=${__data.fields.Pid}&time=10" } ] @@ -1756,7 +1512,7 @@ "options": { "true": { "index": 0, - "text": "RT分析" + "text": "RT\u5206\u6790" } }, "type": "value" @@ -1813,7 +1569,7 @@ "properties": [ { "id": "unit", - "value": "µs" + "value": "\u00b5s" }, { "id": "thresholds", @@ -1821,7 +1577,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1844,7 +1601,7 @@ "properties": [ { "id": "unit", - "value": "µs" + "value": "\u00b5s" }, { "id": "thresholds", @@ -1852,7 +1609,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1921,7 +1679,7 @@ "h": 9, "w": 16, "x": 8, - "y": 36 + "y": 28 }, "id": 50, "options": { @@ -1952,7 +1710,7 @@ "refId": "B" } ], - "title": "Mysql请求详情", + "title": "MySQL\u8bf7\u6c42\u8be6\u60c5", "transformations": [ { "id": "joinByField", @@ -2286,7 +2044,7 @@ }, { "datasource": "sysom-prometheus", - "description": "", + "description": "mysql\u7b49\u5f85\u7cfb\u7edfIO\u8d44\u6e90\u65f6\u6240\u6d88\u8017\u65f6\u95f4\uff0c\u4e00\u822c\u5730\uff0c\u5728\u6709IO\u60c5\u51b5\u4e0b\uff0c\u8be5\u6307\u6807\u5e94\u8d8a\u4f4e\u8d8a\u597d\uff0c\u8bf4\u660e\u7cfb\u7edfIO\u8d44\u6e90\u53ef\u4ee5\u65e0\u963b\u788d\u5730\u83b7\u53d6\uff0c\u5982\u51fa\u73b0\u98d9\u5347\uff0c\u53ef\u80fd\u5f15\u8d77\u670d\u52a1\u8fdb\u7a0b\u963b\u585e", "fieldConfig": { "defaults": { "color": { @@ -2328,11 +2086,12 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, - "unit": "µs" + "unit": "\u00b5s" }, "overrides": [] }, @@ -2340,7 +2099,7 @@ "h": 9, "w": 8, "x": 0, - "y": 45 + "y": 37 }, "id": 22, "options": { @@ -2373,12 +2132,12 @@ "refId": "A" } ], - "title": "Mysql 等待IO资源延迟(平均每秒)", + "title": "MySQL \u7b49\u5f85IO\u8d44\u6e90\u5ef6\u8fdf(\u5e73\u5747\u6bcf\u79d2)", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "", + "description": "mysql\u8bbf\u95ee\u5185\u5b58\u65f6\u6240\u6d88\u8017\u65f6\u95f4\uff0c\u4e00\u822c\u5730\uff0c\u8be5\u6307\u6807\u5e94\u63a5\u8fd1\u4e8e0\uff0c\u8bf4\u660e\u7cfb\u7edf\u5185\u5b58\u8d44\u6e90\u53ef\u4ee5\u65e0\u963b\u788d\u5730\u83b7\u53d6\uff0c\u5982\u51fa\u73b0\u98d9\u5347\uff0c\u53ef\u80fd\u5f15\u8d77\u670d\u52a1\u8fdb\u7a0b\u963b\u585e", "fieldConfig": { "defaults": { "color": { @@ -2420,7 +2179,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -2432,7 +2192,7 @@ "h": 9, "w": 8, "x": 8, - "y": 45 + "y": 37 }, "id": 60, "options": { @@ -2465,12 +2225,12 @@ "refId": "A" } ], - "title": "Mysql申请OS内存延迟", + "title": "MySQL\u7533\u8bf7OS\u5185\u5b58\u5ef6\u8fdf", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "在就绪队列中等待的时间", + "description": "mysql\u5728\u5c31\u7eea\u961f\u5217\u4e2d\u7b49\u5f85\u5206\u914dCPU\u8d44\u6e90\uff0c\u88ab\u8c03\u5ea6\u7684\u65f6\u95f4\uff0c\u8be5\u6307\u6807\u6700\u597d\u662f\u63a5\u8fd1\u4e8e0\uff0c\u5426\u5219\uff0c\u8bf4\u660e\u5f53\u524dmysql\u5b9e\u4f8b\u5185CPU\u8d44\u6e90\u7d27\u5f20", "fieldConfig": { "defaults": { "color": { @@ -2512,7 +2272,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2528,7 +2289,7 @@ "h": 9, "w": 8, "x": 16, - "y": 45 + "y": 37 }, "id": 28, "options": { @@ -2555,30 +2316,592 @@ "editorMode": "code", "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"schedDelay\"}", "interval": "35", - "legendFormat": "调度延迟", + "legendFormat": "\u8c03\u5ea6\u5ef6\u8fdf", "range": true, "refId": "A" } ], - "title": "Mysql OS调度延迟", + "title": "MySQL OS\u8c03\u5ea6\u5ef6\u8fdf", "type": "timeseries" }, { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, + "datasource": "sysom-prometheus", + "description": "\u5c06mysql\u7684\u6570\u636eIO\u603b\u6d88\u8017\u5ef6\u8fdf\uff0c\u53ef\u4ee5\u8fdb\u4e00\u6b65\u7ec6\u5206\u5728OS\uff08\u5305\u542b\u5185\u6838block\u5b50\u7cfb\u7edf\u3001\u786c\u76d8\u9a71\u52a8\u3001IO\u6536\u5272\u3001\u5b8c\u6210IO\u65f6\u5524\u9192\u524d\u53f0\u5e94\u7528\u8fdb\u7a0b\u7b49\u5404\u9636\u6bb5\u6240\u6d88\u8017\u5ef6\u8fdf\uff09\u3001\u786c\u76d8\u5ef6\u8fdf\u6d88\u8017", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 46 + }, + "id": 71, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg(sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"total_delay\"})", + "hide": false, + "interval": "35", + "legendFormat": "MySQL.Total_delay", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg(sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"disk\"})", + "hide": false, + "interval": "35", + "legendFormat": "MySQL.Disk_delay", + "range": true, + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg(sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"total_delay\"}) - avg(sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"disk\"})", + "hide": true, + "interval": "35", + "legendFormat": "MySQL.OS_delay", + "range": true, + "refId": "G" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg(sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"block\"})", + "hide": false, + "interval": "35", + "legendFormat": "MySQL.OS_delay_by_io_block", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg(sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"driver\"})", + "hide": false, + "interval": "35", + "legendFormat": "MySQL.OS_delay_by_disk_driver", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg(sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"complete\"})", + "hide": false, + "interval": "35", + "legendFormat": "MySQL.OS_delay_by_io_complete", + "range": true, + "refId": "E" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg(sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"done\"})", + "hide": false, + "interval": "35", + "legendFormat": "MySQL.OS_delay_by_io_done", + "range": true, + "refId": "F" + } + ], + "title": "MySQL\u6570\u636eIO\u5904\u7406\u5ef6\u8fdf\u5206\u5e03", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u5c06mysql\u8fdb\u884cIO\u7684\u78c1\u76d8\u89c6\u89d2\u6765\u770b\uff0c\u53ef\u4ee5\u8fdb\u4e00\u6b65\u67e5\u770b\u786c\u76d8\u6bcf\u786c\u4ef6\u961f\u5217\uff0c\u5904\u7406IO\u7684\u5ef6\u8fdf\u6d88\u8017\uff0c\u5e76\u7ec6\u5206\u5728OS\u3001\u786c\u76d8\u5ef6\u8fdf\u6d88\u8017", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 46 + }, + "id": 78, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg by(diskname, queue_id) (sysom_iolatency{diskname=~\"$Disk_list\", exported_instance=\"$hostIP\", value=\"total_delay\"})", + "hide": false, + "interval": "35", + "legendFormat": "{{diskname}}.Qid{{queue_id}}.Total_delay", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg by(diskname, queue_id) (sysom_iolatency{diskname=~\"$Disk_list\", exported_instance=\"$hostIP\", value=\"disk\"})", + "hide": false, + "interval": "35", + "legendFormat": "{{diskname}}.Qid{{queue_id}}.Disk_delay", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg by(diskname, queue_id) (sysom_iolatency{diskname=~\"$Disk_list\", exported_instance=\"$hostIP\", value=\"total_delay\"}) - avg by(diskname, queue_id) (sysom_iolatency{diskname=~\"$Disk_list\", exported_instance=\"$hostIP\", value=\"disk\"})", + "hide": false, + "interval": "35", + "legendFormat": "{{diskname}}.Qid{{queue_id}}.OS_delay", + "range": true, + "refId": "C" + } + ], + "title": "MySQL \u78c1\u76d8\u5404\u961f\u5217\u7ea7IO\u5ef6\u8fdf\u5206\u5e03", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u5c06mysql\u8fdb\u884cIO\u7684\u78c1\u76d8\u89c6\u89d2\u6765\u770b\uff0c\u53ef\u4ee5\u8fdb\u4e00\u6b65\u67e5\u770bIO\u7684\u5ef6\u8fdf\u6d88\u8017\uff0c\u5e76\u7ec6\u5206\u5728OS\u3001\u786c\u76d8\u5ef6\u8fdf\u6d88\u8017", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 46 + }, + "id": 79, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg by(diskname) (sysom_iolatency{diskname=~\"$Disk_list\", exported_instance=\"$hostIP\", value=\"total_delay\"})", + "hide": false, + "interval": "35", + "legendFormat": "{{diskname}}.Total_delay", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg by(diskname) (sysom_iolatency{diskname=~\"$Disk_list\", exported_instance=\"$hostIP\", value=\"disk\"})", + "hide": false, + "interval": "35", + "legendFormat": "{{diskname}}.Disk_delay", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "avg by(diskname) (sysom_iolatency{diskname=~\"$Disk_list\", exported_instance=\"$hostIP\", value=\"total_delay\"}) - avg by(diskname) (sysom_iolatency{diskname=~\"$Disk_list\", exported_instance=\"$hostIP\", value=\"disk\"})", + "hide": false, + "interval": "35", + "legendFormat": "{{diskname}}.OS_delay", + "range": true, + "refId": "C" + } + ], + "title": "MySQL \u78c1\u76d8\u7ea7IO\u5ef6\u8fdf\u5206\u5e03", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "mysql\u4ea7\u751f\u6570\u636eIO\uff0c\u4f4d\u4e8e\u54ea\u4e9bCPU\u4e0a\uff0c\u989c\u8272\u8d8a\u6df1\uff0c\u8bf4\u660e\u5728\u6b64CPU\u7f16\u53f7\u8303\u56f4\u5185\u53d1\u8d77\u7684IO\u8bf7\u6c42\u8d8a\u591a\uff08Bucket\u4e3aCPU\u7f16\u53f7\u8303\u56f4\uff0ccount\u4e3aIO\u8bf7\u6c42\u6b21\u6570\uff09\uff1b\u7531\u4e8eCPU\u4e0e\u786c\u4ef6\u961f\u5217\u4e4b\u95f4\u5b58\u5728\u6620\u5c04\u5173\u7cfb\uff0c\u4e00\u822c\u5730\u7528\u4e8e\u5206\u6790\uff0c\u78c1\u76d8\u961f\u5217IO\u5206\u5e03\u4e0d\u5747\u5300\u7684\u60c5\u51b5\uff0c\u89c2\u6d4b\u4e1a\u52a1IO\u662f\u5426\u8db3\u591f\u6253\u6563\u3002", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 55 + }, + "id": 81, + "options": { + "calculate": true, + "calculation": { + "xBuckets": { + "mode": "size", + "value": "2m" + }, + "yBuckets": { + "mode": "size", + "value": "1" + } + }, + "cellGap": 0, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#cc0219", + "min": 0, + "mode": "opacity", + "reverse": true, + "scale": "linear", + "scheme": "Blues", + "steps": 33 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "CPUs", + "axisPlacement": "left", + "decimals": 1, + "min": 0, + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"initiated_cpu|issue_cpu\"}", + "legendFormat": "IO_Generate_Hot_Cpus", + "range": true, + "refId": "A" + } + ], + "title": "MySQL\u6bcfCPU\u751f\u4ea7IO\u8bf7\u6c42\u70ed\u529b\u5206\u5e03", + "type": "heatmap" + }, + { + "datasource": "sysom-prometheus", + "description": "\u54cd\u5e94\u78c1\u76d8\u5b8c\u6210IO\u7684\u4e2d\u65ad\u96c6\u4e2d\u5728\u54ea\u4e9bCPU\u4e0a\uff0c\u989c\u8272\u8d8a\u6df1\uff0c\u8bf4\u660e\u5728\u6b64CPU\u7f16\u53f7\u8303\u56f4\u5185\u5904\u7406\u7684IO\u4e2d\u65ad\u8d8a\u591a\u3002\u53ef\u4ee5\u89c2\u6d4b\u5b9e\u9645\u8fd0\u884c\u8fc7\u7a0b\u4e2d\u7684IO\u76f8\u5e94\u4e2d\u65ad\u662f\u5426\u7b26\u5408\u78c1\u76d8\u786c\u4ef6\u961f\u5217\u4e2d\u65ad\u7ed1\u6838\u671f\u671b\uff0c\u4e00\u822c\u5730\uff0c\u6bcf\u786c\u4ef6\u961f\u5217\u7ed1\u5b9a\u4e00\u4e2aCPU\u6838", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 55 + }, + "id": 82, + "options": { + "calculate": true, + "calculation": { + "xBuckets": { + "mode": "size", + "value": "2m" + }, + "yBuckets": { + "mode": "size", + "value": "1" + } + }, + "cellGap": 1, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#e00a23", + "min": 0, + "mode": "opacity", + "reverse": true, + "scale": "linear", + "scheme": "Blues", + "steps": 33 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisLabel": "CPUs", + "axisPlacement": "left", + "decimals": 1, + "min": 0, + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_iolatency{ppid=\"$Pid\", exported_instance=\"$hostIP\", value=~\"soft_interrupt_cpu|respond_cpu\"}", + "legendFormat": "IO_Complete_Hot_Cpus", + "range": true, + "refId": "A" + } + ], + "title": "MySQL\u6bcfCPU\u5904\u7406IO\u4e2d\u65ad\u70ed\u529b\u5206\u5e03", + "type": "heatmap" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, "x": 0, - "y": 54 + "y": 63 }, "id": 24, "panels": [], - "title": "Mysql throughput details", + "title": "MySQL\u6d41\u91cf\u541e\u5410\u8be6\u60c5", "type": "row" }, { "datasource": "sysom-prometheus", - "description": "", + "description": "The number of bytes read from or written to the device per second", "fieldConfig": { "defaults": { "color": { @@ -2587,12 +2910,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "", + "axisLabel": "bytes read (-) / write (+)", "axisPlacement": "auto", - "axisSoftMax": 4, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, @@ -2615,12 +2937,18 @@ "mode": "off" } }, + "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 } ] }, @@ -2632,56 +2960,60 @@ "h": 8, "w": 12, "x": 0, - "y": 55 + "y": 64 }, - "id": 47, + "id": 20, + "links": [], "options": { "legend": { "calcs": [ - "min", "mean", + "lastNotNull", "max", - "lastNotNull" + "min" ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 0 + "showLegend": true }, "tooltip": { "mode": "multi", - "sort": "desc" + "sort": "none" } }, - "pluginVersion": "9.2.2", + "pluginVersion": "9.2.0", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\", value=\"netRecTraffic\"}", - "hide": false, + "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"ioReadBps\"}", + "format": "time_series", "interval": "35", - "legendFormat": "netRecTraffic", + "intervalFactor": 4, + "legendFormat": "rBPS", "range": true, - "refId": "A" + "refId": "A", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"netSendTraffic\"}", - "hide": false, + "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"ioWriteBps\"}", + "format": "time_series", "interval": "35", - "legendFormat": "netSendTraffic", + "intervalFactor": 1, + "legendFormat": "wBPS", "range": true, - "refId": "B" + "refId": "B", + "step": 240 } ], - "title": "Mysql网络吞吐", + "title": "MySQL IO\u541e\u5410", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "", + "description": "mysql\u5b9e\u4f8b\u4ea7\u751f\u7684\u810f\u9875\u91cf\u4ee5\u53ca\u810f\u9875\u6c34\u4f4d\uff0c\u4e00\u822c\u5730\uff0c\u5f53\u810f\u9875\u91cf\u8d85\u8fc7\u810f\u9875\u6c34\u4f4d\u65f6\uff0c\u4f1a\u5bfc\u81f4mysql\u5199Buffer IO\u963b\u585e", "fieldConfig": { "defaults": { "color": { @@ -2692,6 +3024,7 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "axisSoftMax": 4, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, @@ -2702,11 +3035,8 @@ "viz": false }, "lineInterpolation": "smooth", - "lineStyle": { - "fill": "solid" - }, "lineWidth": 1, - "pointSize": 5, + "pointSize": 2, "scaleDistribution": { "type": "linear" }, @@ -2725,11 +3055,12 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, - "unit": "none" + "unit": "bytes" }, "overrides": [] }, @@ -2737,18 +3068,24 @@ "h": 8, "w": 12, "x": 12, - "y": 55 + "y": 64 }, - "id": 48, + "id": 56, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", - "showLegend": true + "showLegend": true, + "width": 0 }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "none" } }, @@ -2757,20 +3094,40 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\", value=\"requestCount\"}", + "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\", value=\"cgroupDirtyPages\"}", "hide": false, "interval": "35", - "legendFormat": "mysql requestCnt", + "legendFormat": "Dirty Pages", "range": true, "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"cgroupDirtyBlockThresh\"}", + "hide": false, + "interval": "35", + "legendFormat": "Dirty Thresh", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",mode=\"cgroupFlushPages\"}", + "hide": false, + "interval": "35", + "legendFormat": "Flush Pages", + "range": true, + "refId": "C" } ], - "title": "Mysql请求数", + "title": "MySQL OS\u810f\u9875\u91cf", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "The number of bytes read from or written to the device per second", + "description": "", "fieldConfig": { "defaults": { "color": { @@ -2779,11 +3136,12 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "bytes read (-) / write (+)", + "axisLabel": "", "axisPlacement": "auto", + "axisSoftMax": 4, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 20, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, @@ -2806,17 +3164,13 @@ "mode": "off" } }, - "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" - }, - { - "color": "red", - "value": 80 + "color": "green", + "value": null } ] }, @@ -2828,55 +3182,51 @@ "h": 8, "w": 12, "x": 0, - "y": 63 + "y": 72 }, - "id": 20, - "links": [], + "id": 47, "options": { "legend": { "calcs": [ + "min", "mean", - "lastNotNull", "max", - "min" + "lastNotNull" ], "displayMode": "table", "placement": "bottom", - "showLegend": true + "showLegend": true, + "width": 0 }, "tooltip": { "mode": "multi", - "sort": "none" + "sort": "desc" } }, - "pluginVersion": "9.2.0", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"ioReadBps\"}", - "format": "time_series", + "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\", value=\"netRecTraffic\"}", + "hide": false, "interval": "35", - "intervalFactor": 4, - "legendFormat": "rBPS", + "legendFormat": "netRecTraffic", "range": true, - "refId": "A", - "step": 240 + "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"ioWriteBps\"}", - "format": "time_series", + "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"netSendTraffic\"}", + "hide": false, "interval": "35", - "intervalFactor": 1, - "legendFormat": "wBPS", + "legendFormat": "netSendTraffic", "range": true, - "refId": "B", - "step": 240 + "refId": "B" } ], - "title": "Mysql IO吞吐", + "title": "MySQL\u7f51\u7edc\u541e\u5410", "type": "timeseries" }, { @@ -2892,7 +3242,6 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 4, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, @@ -2903,8 +3252,11 @@ "viz": false }, "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, - "pointSize": 2, + "pointSize": 5, "scaleDistribution": { "type": "linear" }, @@ -2923,11 +3275,12 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, - "unit": "bytes" + "unit": "none" }, "overrides": [] }, @@ -2935,9 +3288,9 @@ "h": 8, "w": 12, "x": 12, - "y": 63 + "y": 72 }, - "id": 56, + "id": 48, "options": { "legend": { "calcs": [ @@ -2948,11 +3301,10 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 0 + "showLegend": true }, "tooltip": { - "mode": "multi", + "mode": "single", "sort": "none" } }, @@ -2961,35 +3313,15 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\", value=\"cgroupDirtyPages\"}", + "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\", value=\"requestCount\"}", "hide": false, "interval": "35", - "legendFormat": "Dirty Pages", + "legendFormat": "mysql requestCnt", "range": true, "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",value=\"cgroupDirtyBlockThresh\"}", - "hide": false, - "interval": "35", - "legendFormat": "Dirty Thresh", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_obser_mysqld_process{containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\",mode=\"cgroupFlushPages\"}", - "hide": false, - "interval": "35", - "legendFormat": "Flush Pages", - "range": true, - "refId": "C" } ], - "title": "Mysql OS脏页量", + "title": "MySQL\u8bf7\u6c42\u6570", "type": "timeseries" } ], @@ -3002,8 +3334,8 @@ { "current": { "selected": false, - "text": "${podId}", - "value": "${podId}" + "text": "db0a32b0_7632_44c0_975a_f3bfc5b6c55e", + "value": "db0a32b0_7632_44c0_975a_f3bfc5b6c55e" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_obser_mysqld_process, podID)", @@ -3025,8 +3357,8 @@ { "current": { "selected": false, - "text": "88202520f395", - "value": "88202520f395" + "text": "bcf1a8a097d5", + "value": "bcf1a8a097d5" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_obser_mysqld_process{podID=\"$podId\"}, containerID)", @@ -3047,10 +3379,9 @@ }, { "current": { - "isNone": true, "selected": false, - "text": "None", - "value": "" + "text": "192.168.0.136", + "value": "192.168.0.136" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\"}, exported_instance)", @@ -3071,10 +3402,9 @@ }, { "current": { - "isNone": true, "selected": false, - "text": "None", - "value": "" + "text": "3306", + "value": "3306" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\",exported_instance=\"$hostIP\"}, port)", @@ -3095,10 +3425,9 @@ }, { "current": { - "isNone": true, "selected": false, - "text": "None", - "value": "" + "text": "4411", + "value": "4411" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\",exported_instance=\"$hostIP\",port=\"$Port\"}, pid)", @@ -3119,20 +3448,50 @@ }, { "current": { - "isNone": true, - "selected": false, - "text": "None", - "value": "" + "selected": true, + "text": [ + "None" + ], + "value": [ + "" + ] }, "datasource": "sysom-prometheus", - "definition": "label_values(sysom_obser_app_rt_ntopo{PodUUID=\"$podId\"}, Pod)", - "hide": 0, + "definition": "label_values(sysom_ntopo_node{PodUUID=\"$podId\"}, Pod)", + "hide": 2, "includeAll": false, "multi": true, "name": "podname", "options": [], "query": { - "query": "label_values(sysom_obser_app_rt_ntopo{PodUUID=\"$podId\"}, Pod)", + "query": "label_values(sysom_ntopo_node{PodUUID=\"$podId\"}, Pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "sysom-prometheus", + "definition": "label_values(sysom_iolatency{ppid=\"$Pid\"}, diskname)", + "hide": 2, + "includeAll": true, + "multi": true, + "name": "Disk_list", + "options": [], + "query": { + "query": "label_values(sysom_iolatency{ppid=\"$Pid\"}, diskname)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -3144,7 +3503,7 @@ ] }, "time": { - "from": "now-15m", + "from": "now-1h", "to": "now" }, "timepicker": { @@ -3163,9 +3522,9 @@ ] }, "timezone": "", - "title": "app-mysql", + "title": "app-mysql-server", "uid": "hOk70b34k", - "version": 15, + "version": 319, "weekStart": "" } } \ No newline at end of file diff --git a/deps/4_grafana/sysom-appobserver-mysql-events-dashboard.json b/deps/4_grafana/sysom-appobserver-mysql-events-dashboard.json index db2538b15645a14a26ef6d2db57c950d02075b95..9dd86965a0ab44f0bb9e9a2c07c969bc68d78741 100644 --- a/deps/4_grafana/sysom-appobserver-mysql-events-dashboard.json +++ b/deps/4_grafana/sysom-appobserver-mysql-events-dashboard.json @@ -43,8 +43,7 @@ "mode": "absolute", "steps": [ { - "color": "semi-dark-red", - "value": null + "color": "semi-dark-red" }, { "color": "red", @@ -105,7 +104,7 @@ "value": [ { "options": { - "无": { + "\u65e0": { "color": "text", "index": 0 } @@ -124,7 +123,7 @@ { "targetBlank": true, "title": "", - "url": "../${__data.fields.root_analyz_flag}instance=${__data.fields.instance}" + "url": "../${__data.fields.root_analyz_flag}\ufeff\ufeff\ufeff\ufeffinstance=${__data.fields.instance}" } ] } @@ -174,7 +173,7 @@ "properties": [ { "id": "custom.width", - "value": 132 + "value": 371 } ] }, @@ -205,6 +204,18 @@ "value": 410 } ] + }, + { + "matcher": { + "id": "byName", + "options": "port" + }, + "properties": [ + { + "id": "custom.width", + "value": 71 + } + ] } ] }, @@ -236,7 +247,7 @@ "format": "table", "hide": false, "rawQuery": true, - "rawSql": "SELECT extra, instance,\n CASE \n WHEN JSON_EXTRACT(extra, '$.root_analyz_flag') = 'journal/node?'\n THEN '无'\n ELSE '进一步诊断'\n END AS root_analyze\nFROM sysom.sys_node_log\nWHERE (JSON_EXTRACT(extra, '$.podId') = \"$podId\" OR JSON_EXTRACT(extra, '$.containerId') = \"$containerId\") AND instance = \"$hostIp\" AND UNIX_TIMESTAMP(create_at) > $__from / 1000 AND UNIX_TIMESTAMP(create_at) < $__to / 1000 AND JSON_EXTRACT(extra, '$.reason') IS NOT NULL AND JSON_EXTRACT(extra, '$.tag_set') = 'mysqld' \nORDER BY ts DESC LIMIT 20\n", + "rawSql": "SELECT annotations, instance,\n CASE \n WHEN JSON_EXTRACT(annotations, '$.root_analyz_flag') = 'journal/node?'\n THEN '\u65e0'\n ELSE '\u8fdb\u4e00\u6b65\u8bca\u65ad'\n END AS root_analyze\nFROM sysom.sys_alert_data\nWHERE (JSON_EXTRACT(annotations, '$.podId') = \"$podId\" OR JSON_EXTRACT(annotations, '$.containerId') = \"$containerId\") AND alert_time > $__from AND alert_time < $__to AND JSON_EXTRACT(annotations, '$.reason') IS NOT NULL AND JSON_EXTRACT(annotations, '$.tag_set') = 'mysqld' \nORDER BY alert_time DESC LIMIT 20\n", "refId": "A", "sql": { "columns": [ @@ -263,7 +274,7 @@ "table": "sys_node_log" } ], - "title": "Mysql应用相关异常事件", + "title": "Mysql\u5e94\u7528\u76f8\u5173\u5f02\u5e38\u4e8b\u4ef6", "transformations": [ { "disabled": true, @@ -310,7 +321,7 @@ "options": { "format": "json", "replace": false, - "source": "extra" + "source": "annotations" } }, { @@ -353,6 +364,7 @@ "id": "organize", "options": { "excludeByName": { + "annotations": true, "app_log": true, "containerId": true, "curr": true, @@ -370,18 +382,19 @@ "unit": true }, "indexByName": { - "app_log": 8, + "app_log": 9, "containerId": 7, - "extra": 11, - "instance": 12, + "extra": 12, + "instance": 13, + "level": 8, "metrics": 4, - "os_log": 10, + "os_log": 11, "pid": 6, "podId": 1, "port": 2, - "reason": 9, - "root_analyz_flag": 14, - "root_analyze": 13, + "reason": 10, + "root_analyz_flag": 15, + "root_analyze": 14, "tag_set": 5, "ts": 0, "value": 3 @@ -425,8 +438,7 @@ "mode": "absolute", "steps": [ { - "color": "semi-dark-red", - "value": null + "color": "semi-dark-red" }, { "color": "red", @@ -579,13 +591,13 @@ "value": [ { "targetBlank": true, - "title": "根因诊断", + "title": "\u6839\u56e0\u8bca\u65ad", "url": "../${__data.fields.root_analyz_flag}instance=${__data.fields.instance}" }, { "targetBlank": true, - "title": "指标关联分析", - "url": "../api/v1/rca/rca_call?timestamp=${__data.fields.ts}&base_item=${__data.fields.metrics}&machine_ip=${__data.fields.instance}" + "title": "\u6307\u6807\u5173\u8054\u5206\u6790", + "url": "../api/v1/rca/rca_call?timestamp=${__data.fields.ts}\ufeff&base_item=${__data.fields.metrics}&machine_ip=${__data.fields.instance}" } ] }, @@ -600,7 +612,7 @@ "options": { "responseTimeAvg": { "index": 0, - "text": "进一步诊断" + "text": "\u8fdb\u4e00\u6b65\u8bca\u65ad" } }, "type": "value" @@ -648,7 +660,7 @@ { "matcher": { "id": "byName", - "options": "extra" + "options": "annotations" }, "properties": [ { @@ -765,7 +777,7 @@ "properties": [ { "id": "custom.width", - "value": 361 + "value": 373 } ] }, @@ -792,6 +804,18 @@ "value": true } ] + }, + { + "matcher": { + "id": "byName", + "options": "port" + }, + "properties": [ + { + "id": "custom.width", + "value": 89 + } + ] } ] }, @@ -823,7 +847,7 @@ "format": "table", "hide": false, "rawQuery": true, - "rawSql": "SELECT *, '进一步诊断' as 'root cause analyze' FROM sysom.sys_node_log \nWHERE UNIX_TIMESTAMP(create_at) > (\n SELECT COALESCE(MAX(UNIX_TIMESTAMP(create_at)), '0000-00-00 00:00:00')\n FROM sysom.sys_node_log \n WHERE (JSON_EXTRACT(extra, '$.podId') = \"$podId\" OR JSON_EXTRACT(extra, '$.containerId') = \"$containerId\" AND instance = \"$hostIp\" ) \n AND UNIX_TIMESTAMP(create_at) > $__from / 1000 \n AND UNIX_TIMESTAMP(create_at) < $__to / 1000 \n AND JSON_EXTRACT(extra, '$.reason') IS NOT NULL \n AND JSON_EXTRACT(extra, '$.tag_set') = 'mysqld'\n)\nAND UNIX_TIMESTAMP(create_at) > $__from / 1000 \nAND UNIX_TIMESTAMP(create_at) < $__to / 1000 \nAND JSON_EXTRACT(extra, '$.tag_set') = 'mysqld'\nAND JSON_EXTRACT(extra, '$.app_log') IS NULL\nAND (JSON_EXTRACT(extra, '$.podId') = \"$podId\" OR JSON_EXTRACT(extra, '$.containerId') = \"$containerId\" AND instance = \"$hostIp\")\nORDER BY ts DESC \nLIMIT 20", + "rawSql": "SELECT alert_time, annotations, instance, '\u8fdb\u4e00\u6b65\u8bca\u65ad' as 'root cause analyze' FROM sysom.sys_alert_data \nWHERE alert_time > (\n SELECT COALESCE(MAX(alert_time / 1000), '0000-00-00 00:00:00') \n FROM sysom.sys_alert_data \n WHERE (JSON_EXTRACT(annotations, '$.podId') = \"$podId\" OR JSON_EXTRACT(annotations, '$.containerId') = \"$containerId\") \n AND alert_time > $__from \n AND alert_time < $__to\n AND JSON_EXTRACT(annotations, '$.reason') IS NOT NULL \n AND JSON_EXTRACT(annotations, '$.tag_set') = 'mysqld'\n)\nAND alert_time > $__from\nAND alert_time < $__to\nAND (JSON_EXTRACT(annotations, '$.tag_set') = 'mysqld' OR JSON_EXTRACT(annotations, '$.value') LIKE '%mysqld%')\nAND JSON_EXTRACT(annotations, '$.app_log') IS NULL\nAND (JSON_EXTRACT(annotations, '$.podId') = \"$podId\" OR JSON_EXTRACT(annotations, '$.containerId') = \"$containerId\" OR JSON_EXTRACT(annotations, '$.value') LIKE '%$Pid%')\nORDER BY alert_time DESC \nLIMIT 20", "refId": "A", "sql": { "columns": [ @@ -850,12 +874,13 @@ "table": "sys_node_log" } ], - "title": "Mysql OS相关异常事件", + "title": "Mysql OS\u76f8\u5173\u5f02\u5e38\u4e8b\u4ef6", "transformations": [ { "id": "organize", "options": { "excludeByName": { + "alert_time": true, "app_log": true, "containerId": true, "create_at": true, @@ -868,6 +893,7 @@ "os_log": true, "pid": false, "reason": true, + "root cause analyze": false, "tag_set": false, "ts": true, "unit": true, @@ -899,7 +925,7 @@ "options": { "format": "json", "replace": false, - "source": "extra" + "source": "annotations" } }, { @@ -966,23 +992,18 @@ "value": false }, "indexByName": { - "app_log": 11, - "containerId": 9, - "curr": 15, - "disk": 14, - "extra": 4, + "annotations": 10, + "containerId": 7, + "details": 12, "instance": 1, - "level": 10, - "metrics": 7, - "os_log": 13, - "pid": 3, + "level": 8, + "pid": 4, "podId": 2, - "reason": 12, - "root cause analyze": 17, - "tag_set": 8, - "thresh": 16, + "port": 3, + "root cause analyze": 9, + "root_analyz_flag": 11, + "tag_set": 6, "ts": 0, - "unit": 6, "value": 5 }, "renameByName": {} @@ -1004,7 +1025,7 @@ "type": "table" } ], - "refresh": "5s", + "refresh": "15s", "schemaVersion": 37, "style": "dark", "tags": [], @@ -1013,8 +1034,8 @@ { "current": { "selected": false, - "text": "NULL", - "value": "NULL" + "text": "36ec6dca_b5da_4b3b_b693_b6dbe4b8a9a6", + "value": "36ec6dca_b5da_4b3b_b693_b6dbe4b8a9a6" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_obser_mysqld_process, podID)", @@ -1036,8 +1057,8 @@ { "current": { "selected": false, - "text": "88202520f395", - "value": "88202520f395" + "text": "a83df925aa86", + "value": "a83df925aa86" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_obser_mysqld_process{podID=\"$podId\"}, containerID)", @@ -1059,18 +1080,18 @@ { "current": { "selected": false, - "text": "192.168.0.6", - "value": "192.168.0.6" + "text": "192.168.0.138", + "value": "192.168.0.138" }, "datasource": "sysom-prometheus", - "definition": "label_values(sysom_obser_mysqld_process{containerID=\"$containerId\"}, exported_instance)", + "definition": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\"}, exported_instance)", "hide": 0, "includeAll": false, "multi": false, "name": "hostIp", "options": [], "query": { - "query": "label_values(sysom_obser_mysqld_process{containerID=\"$containerId\"}, exported_instance)", + "query": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\"}, exported_instance)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -1081,6 +1102,53 @@ }, { "current": { + "selected": false, + "text": "3306", + "value": "3306" + }, + "datasource": "sysom-prometheus", + "definition": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\",exported_instance=\"$hostIp\"}, port)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "Port", + "options": [], + "query": { + "query": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\",exported_instance=\"$hostIp\"}, port)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "4110", + "value": "4110" + }, + "datasource": "sysom-prometheus", + "definition": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\",exported_instance=\"$hostIp\",port=\"$Port\"}, pid)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "Pid", + "options": [], + "query": { + "query": "label_values(sysom_obser_mysqld_process{podID=\"$podId\",containerID=\"$containerId\",exported_instance=\"$hostIp\",port=\"$Port\"}, pid)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "isNone": true, "selected": false, "text": "None", "value": "" @@ -1105,14 +1173,28 @@ ] }, "time": { - "from": "now-15m", + "from": "now-30m", "to": "now" }, - "timepicker": {}, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "15s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, "timezone": "", "title": "app-mysql-events", "uid": "Ub__1x3Vz", - "version": 4, + "version": 107, "weekStart": "" } } \ No newline at end of file diff --git a/deps/4_grafana/sysom-appobserver-nginx-dashboard.json b/deps/4_grafana/sysom-appobserver-nginx-dashboard.json new file mode 100644 index 0000000000000000000000000000000000000000..78b411be93c942d91cc42323a53607ece972ba93 --- /dev/null +++ b/deps/4_grafana/sysom-appobserver-nginx-dashboard.json @@ -0,0 +1,1088 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 8, + "panels": [], + "title": "\u5f02\u5e38\u4e8b\u4ef6", + "type": "row" + }, + { + "datasource": "sysom-prometheus", + "description": "nginx\u670d\u52a1\u5728OS\u4e2d\u88ab\u89c2\u6d4b\u5230\u53ef\u80fd\u5b58\u5728\u7684\u5f02\u5e38\u544a\u8b66\u4e8b\u4ef6\u7edf\u8ba1\u4fe1\u606f", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "\u8bf7\u6c42\u6296\u52a8" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": false, + "title": "", + "url": "../grafana/d/HtuWUeSSz/nginx-event?var-instance=${instance}&var-masterPid=${masterPid}" + } + ] + }, + { + "id": "unit", + "value": "none" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u8bf7\u6c424xx" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "", + "url": "../grafana/d/HtuWUeSSz/nginx-event?var-instance=${instance}&var-masterPid=${masterPid}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u8bf7\u6c425xx" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "", + "url": "../grafana/d/HtuWUeSSz/nginx-event?var-instance=${instance}&var-masterPid=${masterPid}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u9519\u8bef\u65e5\u5fd7" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "", + "url": "../grafana/d/HtuWUeSSz/nginx-event?var-instance=${instance}&var-masterPid=${masterPid}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 18, + "links": [], + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "round(sum(increase(sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"requestJitter\"}[$__range])))", + "hide": false, + "interval": "35", + "legendFormat": "\u8bf7\u6c42\u6296\u52a8", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "round(sum(increase(sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"status_4xx\"}[$__range])))", + "hide": false, + "interval": "35", + "legendFormat": "\u8bf7\u6c424xx", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "round(sum(increase(sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"status_5xx\"}[$__range])))", + "hide": false, + "interval": "35", + "legendFormat": "\u8bf7\u6c425xx", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"errorLog\"}", + "hide": false, + "legendFormat": "\u9519\u8bef\u65e5\u5fd7", + "range": true, + "refId": "D" + } + ], + "title": "\u5f02\u5e38\u544a\u8b66\u5206\u5e03\uff08\u6b21\u6570\uff09", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 6, + "panels": [], + "title": "\u5e94\u7528\u6307\u6807", + "type": "row" + }, + { + "datasource": "sysom-prometheus", + "description": "nginx\u7684\u8bf7\u6c42\u6570", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"requests\"}", + "interval": "35", + "legendFormat": "requests", + "range": true, + "refId": "A" + } + ], + "title": "\u8bf7\u6c42\u6570", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "http\u8bf7\u6c42\u7684status\u5206\u5e03\u60c5\u51b5", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"status_1xx\"}", + "interval": "35", + "legendFormat": "status_1xx", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"status_2xx\"}", + "hide": false, + "interval": "35", + "legendFormat": "status_2xx", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"status_3xx\"}", + "hide": false, + "interval": "35", + "legendFormat": "status_3xx", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"status_4xx\"}", + "hide": false, + "interval": "35", + "legendFormat": "status_4xx", + "range": true, + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"status_5xx\"}", + "hide": false, + "interval": "35", + "legendFormat": "status_5xx", + "range": true, + "refId": "E" + } + ], + "title": "http status\u5206\u5e03", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u54cd\u5e94\u65f6\u5ef6\uff0c\u5305\u62ecrequest\u548cupstream", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"requestTimeAvg\"}", + "interval": "35", + "legendFormat": "requestTime", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"upstreamTimeAvg\"}", + "hide": false, + "interval": "35", + "legendFormat": "upstreamTime", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"maxRequestTime\"}", + "hide": false, + "interval": "35", + "legendFormat": "maxRequestTime", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"maxUpstreamTime\"}", + "hide": false, + "interval": "35", + "legendFormat": "maxUpstreamTime", + "range": true, + "refId": "D" + } + ], + "title": "\u54cd\u5e94\u65f6\u5ef6", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "nginx workers\u7684\u6570\u91cf", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"workersCount\"}", + "interval": "35", + "legendFormat": "workersCount", + "range": true, + "refId": "A" + } + ], + "title": "workers\u6570\u91cf", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "nginx\u5f53\u524d\u6d3b\u8dc3\u7684\u8fde\u63a5\u6570", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "activeConnections" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"activeConnections\"}", + "interval": "35", + "legendFormat": "activeConnections", + "range": true, + "refId": "A" + } + ], + "title": "\u6d3b\u8dc3\u7684\u8fde\u63a5\u6570", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 4, + "panels": [], + "title": "\u7cfb\u7edf\u6307\u6807", + "type": "row" + }, + { + "datasource": "sysom-prometheus", + "description": "nginx\u7684cpu\u5229\u7528\u7387", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_worker_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"cpuTot\"}", + "interval": "35", + "legendFormat": "{{pid}}", + "range": true, + "refId": "A" + } + ], + "title": "nginx\u8fdb\u7a0bcpu\u5229\u7528\u7387", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "nginx\u7684\u5185\u5b58\u5229\u7528\u7387", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_worker_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"memPct\"}", + "interval": "35", + "legendFormat": "{{pid}}", + "range": true, + "refId": "A" + } + ], + "title": "nginx\u8fdb\u7a0b\u5185\u5b58\u5229\u7528\u7387", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u663e\u793a\u7f51\u7edc\u6d41\u91cf\u4fe1\u606f", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_worker_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"inBytes\"}", + "interval": "35", + "legendFormat": "{{pid}}", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_nginx_main_metrics{exported_instance=\"$instance\", masterPid=\"$masterPid\", value=\"outBytes\"}", + "hide": false, + "interval": "35", + "legendFormat": "{{pid}}", + "range": true, + "refId": "B" + } + ], + "title": "nginx\u8fdb\u7a0b\u7f51\u7edc\u6d41\u91cf", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "192.168.0.77", + "value": "192.168.0.77" + }, + "datasource": "sysom-prometheus", + "definition": "label_values(sysom_nginx_main_metrics, exported_instance)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "instance", + "options": [], + "query": { + "query": "label_values(sysom_nginx_main_metrics, exported_instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "809", + "value": "809" + }, + "datasource": "sysom-prometheus", + "definition": "label_values(sysom_nginx_main_metrics, masterPid)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "masterPid", + "options": [], + "query": { + "query": "label_values(sysom_nginx_main_metrics, masterPid)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "nginx", + "uid": "6Mztrm4Ik", + "version": 86, + "weekStart": "" + } +} \ No newline at end of file diff --git a/deps/4_grafana/sysom-appobserver-nginx-events-dashboard.json b/deps/4_grafana/sysom-appobserver-nginx-events-dashboard.json new file mode 100644 index 0000000000000000000000000000000000000000..9e083221491aa738d58bb18fa2a06f080798f72f --- /dev/null +++ b/deps/4_grafana/sysom-appobserver-nginx-events-dashboard.json @@ -0,0 +1,452 @@ +{ + "dashboard": { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [], + "title": "\u5f02\u5e38\u4e8b\u4ef6\u8be6\u60c5", + "type": "row" + }, + { + "datasource": "sysom-mysql", + "description": "nginx http\u5f02\u5e38\u8bf7\u6c42\u8be6\u60c5", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red" + } + ] + }, + "unit": "\u00b5s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ts" + }, + "properties": [ + { + "id": "displayName", + "value": "time" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "labels" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "masterPid" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "diagId" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "status" + }, + "properties": [ + { + "id": "unit", + "value": "string" + }, + { + "id": "custom.width", + "value": 166 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "requestTime" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + }, + { + "id": "custom.width", + "value": 158 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "upstreamResponseTime" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + }, + { + "id": "custom.width", + "value": 241 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "upstreamAddr" + }, + "properties": [ + { + "id": "custom.width", + "value": 191 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u65f6\u95f4" + }, + "properties": [ + { + "id": "custom.width", + "value": 190 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "remoteAddr" + }, + "properties": [ + { + "id": "custom.width", + "value": 173 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "custom.width", + "value": 287 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "reason" + }, + "properties": [ + { + "id": "custom.width", + "value": 259 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "app_diag" + }, + "properties": [ + { + "id": "displayName", + "value": "applicationDiagnosis" + }, + { + "id": "custom.displayMode", + "value": "color-text" + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "http://sysom_test.qjm253.cn/diagnose/link/procdiag?instance=${instance}&ipport=${__data.fields.upstreamAddr}&time=30" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "diag" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "applicationDiagnosis" + }, + "properties": [ + { + "id": "custom.width", + "value": 243 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "time" + }, + "properties": [ + { + "id": "custom.width", + "value": 272 + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 6, + "options": { + "footer": { + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-mysql", + "editorMode": "code", + "expr": "", + "format": "table", + "legendFormat": "__auto", + "range": true, + "rawQuery": true, + "rawSql": "SELECT labels, instance,\n CASE \n WHEN JSON_EXTRACT(labels, '$.diag') = 'true'\n THEN '\u5e94\u7528\u6296\u52a8\u8bca\u65ad'\n ELSE '\u65e0'\n END AS app_diag\nFROM sysom.sys_alert_data\nWHERE instance = \"$instance\" AND JSON_EXTRACT(labels, '$.masterPid') = \"$masterPid\" AND UNIX_TIMESTAMP(created_at) > $__from / 1000 AND UNIX_TIMESTAMP(created_at) < $__to / 1000\nORDER BY alert_time DESC LIMIT 20\n", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "\u5f02\u5e38\u8bf7\u6c42\u8be6\u60c5", + "transformations": [ + { + "id": "extractFields", + "options": { + "source": "labels" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "app_diag": 13, + "diag": 12, + "diagId": 9, + "instance": 1, + "labels": 0, + "masterPid": 8, + "reason": 11, + "remoteAddr": 3, + "request": 5, + "requestTime": 6, + "status": 10, + "ts": 2, + "upstreamAddr": 4, + "upstreamResponseTime": 7 + }, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "192.168.0.77", + "value": "192.168.0.77" + }, + "datasource": "sysom-prometheus", + "definition": "label_values(sysom_nginx_main_metrics, exported_instance)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "instance", + "options": [], + "query": { + "query": "label_values(sysom_nginx_main_metrics, exported_instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "809", + "value": "809" + }, + "datasource": "sysom-prometheus", + "definition": "label_values(sysom_nginx_main_metrics, masterPid)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "masterPid", + "options": [], + "query": { + "query": "label_values(sysom_nginx_main_metrics, masterPid)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "nginx-event", + "uid": "HtuWUeSSz", + "version": 25, + "weekStart": "" + } +} \ No newline at end of file diff --git a/deps/4_grafana/sysom-appobserver-process-dashboard.json b/deps/4_grafana/sysom-appobserver-process-dashboard.json index 90e5718371fa9edb10f6d1a23f834eebddf595a4..d50b5840229c82524597d9663d1ae59c5c45a9a3 100644 --- a/deps/4_grafana/sysom-appobserver-process-dashboard.json +++ b/deps/4_grafana/sysom-appobserver-process-dashboard.json @@ -442,7 +442,7 @@ "refId": "B" } ], - "title": "cpu占用率", + "title": "cpu占比", "type": "piechart" }, { diff --git a/deps/4_grafana/sysom-cluster-dashboard.json b/deps/4_grafana/sysom-cluster-dashboard.json index 644ad2b4f35b6ffc4c203d653728ce7d48d65b5a..07f78cb16ee0fb57817f37be3175109065c72086 100644 --- a/deps/4_grafana/sysom-cluster-dashboard.json +++ b/deps/4_grafana/sysom-cluster-dashboard.json @@ -34,13 +34,14 @@ "x": 0, "y": 0 }, - "id": 60, + "id": 74, "panels": [], - "title": "集群资源概览", + "title": "\u96c6\u7fa4\u5065\u5eb7\u5ea6\u6982\u89c8", "type": "row" }, { "datasource": "sysom-prometheus", + "description": "", "fieldConfig": { "defaults": { "color": { @@ -48,15 +49,27 @@ }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "blue", + "color": "text", "value": null }, { - "color": "red", - "value": 1000 + "color": "#E24D42", + "value": 0 + }, + { + "color": "#EF843C", + "value": 60 + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "dark-green", + "value": 100 } ] } @@ -64,17 +77,15 @@ "overrides": [] }, "gridPos": { - "h": 7, - "w": 4, + "h": 5, + "w": 5, "x": 0, "y": 1 }, - "id": 4, + "id": 76, + "interval": "30s", "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" @@ -82,179 +93,86 @@ "fields": "", "values": false }, - "textMode": "value_and_name" + "showThresholdLabels": false, + "showThresholdMarkers": true }, "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": true, - "expr": "count(sysom_proc_cpus{mode=\"nice\"})", - "interval": "", - "legendFormat": "cores", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\",type=\"total\"}", + "legendFormat": "{{exported_instance}}", "range": true, "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "count(sysom_proc_cpu_total{mode=\"nice\"})", - "hide": false, - "interval": "", - "legendFormat": "nodes", - "range": true, - "refId": "B" } ], - "title": "集群总CPU核数/节点数", - "type": "stat" + "title": "Cluster Health", + "type": "gauge" }, { "datasource": "sysom-prometheus", + "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } + "mode": "thresholds" }, - "mappings": [] - }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", + "decimals": 0, + "mappings": [ + { "options": { - "mode": "exclude", - "names": [ - "sum(sysak_proc_cpu_total{mode=\"user\"})", - "sum(sysak_proc_cpu_total{mode=\"nice\"})", - "sum(sysak_proc_cpu_total{mode=\"sys\"})", - "sum(sysak_proc_cpu_total{mode=\"softirq\"})", - "sum(sysak_proc_cpu_total{mode=\"iowait\"})", - "iowait", - "user", - "sys", - "softirq", - "idle" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true + "match": "null", + "result": { + "text": "N/A" } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "idle" - }, - "properties": [ + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "color", - "value": { - "fixedColor": "super-light-green", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "user" - }, - "properties": [ + "color": "text", + "value": null + }, { - "id": "color", - "value": { - "fixedColor": "dark-yellow", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "sys" - }, - "properties": [ + "color": "semi-dark-red", + "value": 0 + }, { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "softirq" - }, - "properties": [ + "color": "orange", + "value": 60 + }, { - "id": "color", - "value": { - "fixedColor": "dark-purple", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "iowait" - }, - "properties": [ + "color": "dark-yellow", + "value": 80 + }, { - "id": "color", - "value": { - "fixedColor": "dark-blue", - "mode": "fixed" - } + "color": "green", + "value": 100 } ] - } - ] + }, + "unit": "none" + }, + "overrides": [] }, "gridPos": { - "h": 7, - "w": 8, - "x": 4, + "h": 5, + "w": 5, + "x": 5, "y": 1 }, - "id": 41, + "id": 78, + "interval": "30s", + "links": [], + "maxDataPoints": 100, "options": { - "legend": { - "displayMode": "list", - "placement": "right", - "showLegend": true, - "values": [ - "percent" - ] - }, - "pieType": "pie", + "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" @@ -262,70 +180,27 @@ "fields": "", "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showThresholdLabels": false, + "showThresholdMarkers": false }, - "pluginVersion": "8.3.1", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_cpu_total{mode=\"user\"} + on(instance)sysom_proc_cpu_total{mode=\"nice\"})", - "interval": "", - "legendFormat": "user", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_cpu_total{mode=\"sys\"})", - "hide": false, - "interval": "", - "legendFormat": "sys", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_cpu_total{mode=\"softirq\"})", - "hide": false, - "interval": "", - "legendFormat": "softirq", - "range": true, - "refId": "D" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_cpu_total{mode=\"iowait\"})", - "hide": false, - "interval": "", - "legendFormat": "iowait", - "range": true, - "refId": "E" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_cpu_total{mode=\"idle\"})", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"error\"}", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "idle", - "range": true, - "refId": "F" + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 } ], - "title": "集群CPU利用率分布", - "type": "piechart" + "title": "Errors Health", + "type": "gauge" }, { "datasource": "sysom-prometheus", @@ -333,157 +208,63 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" } - }, - "mappings": [] - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "idle" - }, - "properties": [ + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "color", - "value": { - "fixedColor": "super-light-green", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "user" - }, - "properties": [ + "color": "blue", + "value": null + }, { - "id": "color", - "value": { - "fixedColor": "dark-yellow", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "sys" - }, - "properties": [ + "color": "red", + "value": 0 + }, { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "softirq" - }, - "properties": [ + "color": "#EAB839", + "value": 60 + }, { - "id": "color", - "value": { - "fixedColor": "dark-purple", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "iowait" - }, - "properties": [ + "color": "orange", + "value": 80 + }, { - "id": "color", - "value": { - "fixedColor": "dark-blue", - "mode": "fixed" - } + "color": "green", + "value": 100 } ] }, - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "sum(sysak_proc_cpu_total{mode=\"user\"})", - "sum(sysak_proc_cpu_total{mode=\"nice\"})", - "sum(sysak_proc_cpu_total{mode=\"sys\"})", - "sum(sysak_proc_cpu_total{mode=\"softirq\"})", - "sum(sysak_proc_cpu_total{mode=\"iowait\"})", - "iowait", - "user", - "sys", - "softirq", - "idle", - "sysak_proc_meminfo{exported_instance=\"i-2ze2py754b42by09ig1n\", instance=\"192.168.57.67:32760\", job=\"cluster1\", value=\"kernel_reserved\"}", - "sysak_proc_meminfo{exported_instance=\"i-2ze2py754b42by09ig1n\", instance=\"192.168.57.68:32760\", job=\"cluster1\", value=\"kernel_reserved\"}", - "sysak_proc_meminfo{exported_instance=\"i-2ze2py754b42by09ig1n\", instance=\"192.168.57.69:32760\", job=\"cluster1\", value=\"kernel_reserved\"}", - "sum(sysak_proc_meminfo{value=\"kernel_reserved\"})", - "user_used", - "kernel_reserved", - "free", - "app_used", - "kernel_used", - "app", - "reserved", - "kernel" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "unit": "none" + }, + "overrides": [] }, "gridPos": { - "h": 7, - "w": 7, - "x": 12, + "h": 5, + "w": 5, + "x": 10, "y": 1 }, - "id": 15, + "id": 80, + "interval": "30s", + "links": [], + "maxDataPoints": 100, "options": { - "legend": { - "displayMode": "list", - "placement": "right", - "showLegend": true, - "values": [ - "percent" - ] - }, - "pieType": "pie", + "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" @@ -491,68 +272,142 @@ "fields": "", "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showThresholdLabels": false, + "showThresholdMarkers": false }, - "pluginVersion": "8.3.1", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_meminfo{value=\"kernel_reserved\"})", - "interval": "", - "legendFormat": "reserved", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_meminfo{value=\"user_used\"})", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"latency\"}", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "app", - "range": true, - "refId": "B" + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "title": "Latency Health", + "type": "gauge" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "orange", + "value": 60 + }, + { + "color": "dark-yellow", + "value": 80 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "none" }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_meminfo{value=\"MemFree\"})", - "hide": false, - "interval": "", - "legendFormat": "free", - "range": true, - "refId": "C" + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 15, + "y": 1 + }, + "id": 82, + "interval": "30s", + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": false + }, + "pluginVersion": "9.2.2", + "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_meminfo{value=\"kernel_used\"} - on(instance)sysom_proc_meminfo{value=\"kernel_reserved\"})", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"capacity\"}", + "format": "time_series", "hide": false, + "instant": false, "interval": "", - "legendFormat": "kernel", - "range": true, - "refId": "D" + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 } ], - "title": "集群内存使用分布", - "type": "piechart" + "title": "Saturation Health", + "type": "gauge" }, { "datasource": "sysom-prometheus", + "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "mappings": [], + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -562,26 +417,38 @@ }, { "color": "red", - "value": 1000000000000 + "value": 0 + }, + { + "color": "orange", + "value": 60 + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "green", + "value": 100 } ] }, - "unit": "bytes" + "unit": "none" }, "overrides": [] }, "gridPos": { - "h": 7, - "w": 5, - "x": 19, + "h": 5, + "w": 4, + "x": 20, "y": 1 }, - "id": 17, + "id": 84, + "interval": "30s", + "links": [], + "maxDataPoints": 100, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", + "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" @@ -589,109 +456,525 @@ "fields": "", "values": false }, - "textMode": "auto" + "showThresholdLabels": false, + "showThresholdMarkers": false }, "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": true, - "expr": "sum(sysom_proc_meminfo{value=\"MemTotal\"} * 1024)", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\",type=\"load\"}", + "format": "time_series", + "hide": false, + "instant": false, "interval": "", + "intervalFactor": 1, "legendFormat": "", - "range": true, - "refId": "A" + "refId": "A", + "step": 20 } ], - "title": "集群内存总量", - "type": "stat" + "title": "Load(Traffic) Health", + "type": "gauge" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "\u6307\u6807\u5f97\u5206" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + { + "id": "custom.width", + "value": 274 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u5f02\u5e38\u6307\u6807" + }, + "properties": [ + { + "id": "custom.width", + "value": 199 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ + { + "id": "custom.width", + "value": 229 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u6307\u6807\u7c7b\u522b" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 10, + "x": 0, + "y": 6 + }, + "id": 94, + "interval": "30s", + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-mysql", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nmetric_type,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE cluster = '$cluster'\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Nodes Overview", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "instance": false, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "metric_type": 2, + "score": 3, + "value": 4 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "metric_type": "\u6307\u6807\u7c7b\u522b", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206", + "value": "\u6307\u6807\u503c" + } + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } + } + ], + "type": "table" + }, + { + "datasource": "sysom-mysql", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" }, "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } + "align": "auto", + "displayMode": "auto", + "inspect": false }, - "mappings": [] + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } }, "overrides": [ { "matcher": { "id": "byName", - "options": "ratelimit callbacks suppressed" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", "value": { - "fixedColor": "super-light-purple", - "mode": "fixed" + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] } + }, + { + "id": "custom.width", + "value": 144 } ] }, { "matcher": { "id": "byName", - "options": "异常节点" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "orange", - "mode": "fixed" - } + "id": "custom.width", + "value": 199 } ] }, { "matcher": { "id": "byName", - "options": "正常节点" + "options": "node" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "id": "custom.width", + "value": 229 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u6307\u6807\u7c7b\u522b" + }, + "properties": [ + { + "id": "custom.width", + "value": 120 } ] } ] }, + "gridPos": { + "h": 9, + "w": 14, + "x": 10, + "y": 6 + }, + "id": 95, + "interval": "30s", + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "pod" + } + ] + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-mysql", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\npod,\nnamespace,\nmetric_type,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE cluster = '$cluster'\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Pods Overview", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "instance": false, + "namespace": false, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "metric_type": 4, + "namespace": 3, + "pod": 2, + "score": 5, + "value": 6 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "metric_type": "\u6307\u6807\u7c7b\u522b", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206", + "value": "\u6307\u6807\u503c" + } + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 60, + "panels": [], + "title": "\u96c6\u7fa4\u8d44\u6e90\u6982\u89c8", + "type": "row" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "red", + "value": 1000 + } + ] + } + }, + "overrides": [] + }, "gridPos": { "h": 7, - "w": 6, + "w": 4, "x": 0, - "y": 8 + "y": 16 }, - "id": 67, + "id": 4, "options": { - "displayLabels": [ - "value" - ], - "legend": { - "displayMode": "list", - "placement": "right", - "showLegend": true, - "values": [ - "percent" - ] - }, - "pieType": "pie", + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" @@ -699,10 +982,7 @@ "fields": "", "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "textMode": "value_and_name" }, "pluginVersion": "9.2.2", "targets": [ @@ -710,31 +990,29 @@ "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": true, - "expr": "count(sysom_net_tcp_count{value=\"RetransSegs\"}/on(instance)sysom_net_tcp_count{value=\"OutSegs\"}+0.00001>0.005 or topk(1,sysom_net_tcp_count{value=\"RetransSegs\"}))-1", - "hide": false, + "expr": "count(sysom_proc_cpus{mode=\"nice\"})", "interval": "", - "legendFormat": "异常节点", + "legendFormat": "cores", "range": true, - "refId": "F" + "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": true, - "expr": "count(sysom_proc_meminfo{value=\"kernel_used\"}) - count(sysom_net_tcp_count{value=\"RetransSegs\"}/on(instance)sysom_net_tcp_count{value=\"OutSegs\"}+0.00001>0.005 or topk(1,sysom_net_tcp_count{value=\"RetransSegs\"})) + 1", + "expr": "count(sysom_proc_cpu_total{mode=\"nice\"})", "hide": false, "interval": "", - "legendFormat": "正常节点", + "legendFormat": "nodes", "range": true, "refId": "B" } ], - "title": "集群网络健康度", - "type": "piechart" + "title": "\u96c6\u7fa4\u603bCPU\u6838\u6570/\u8282\u70b9\u6570", + "type": "stat" }, { "datasource": "sysom-prometheus", - "description": "", "fieldConfig": { "defaults": { "color": { @@ -751,16 +1029,34 @@ }, "overrides": [ { + "__systemRef": "hideSeriesFrom", "matcher": { - "id": "byName", - "options": "ratelimit callbacks suppressed" + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "sum(sysak_proc_cpu_total{mode=\"user\"})", + "sum(sysak_proc_cpu_total{mode=\"nice\"})", + "sum(sysak_proc_cpu_total{mode=\"sys\"})", + "sum(sysak_proc_cpu_total{mode=\"softirq\"})", + "sum(sysak_proc_cpu_total{mode=\"iowait\"})", + "iowait", + "user", + "sys", + "softirq", + "idle" + ], + "prefix": "All except:", + "readOnly": true + } }, "properties": [ { - "id": "color", + "id": "custom.hideFrom", "value": { - "fixedColor": "super-light-purple", - "mode": "fixed" + "legend": false, + "tooltip": false, + "viz": true } } ] @@ -768,13 +1064,13 @@ { "matcher": { "id": "byName", - "options": "正常节点" + "options": "idle" }, "properties": [ { "id": "color", "value": { - "fixedColor": "green", + "fixedColor": "super-light-green", "mode": "fixed" } } @@ -783,108 +1079,28 @@ { "matcher": { "id": "byName", - "options": "异常节点" + "options": "user" }, "properties": [ { "id": "color", "value": { - "fixedColor": "orange", + "fixedColor": "dark-yellow", "mode": "fixed" } } ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 8 - }, - "id": 56, - "options": { - "displayLabels": [ - "value" - ], - "legend": { - "displayMode": "list", - "placement": "right", - "showLegend": true, - "values": [ - "percent" - ] - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "count((sysom_proc_meminfo{value=\"kernel_used\"} > on(instance)(0.3*sysom_proc_meminfo{value=\"total\"})) or (sum(rate(sysom_cg_memgdrcm_latency{value=\"memDrcm_glb_lat_total\"}[$__rate_interval])) > 0) or (topk(1,sysom_net_tcp_count{value=\"RetransSegs\"}))) - 1", - "hide": false, - "interval": "", - "legendFormat": "异常节点", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": true, - "expr": "count(sysom_proc_meminfo{value=\"kernel_used\"}) - count((sysom_proc_meminfo{value=\"kernel_used\"} > on(instance)(0.3*sysom_proc_meminfo{value=\"total\"})) or (sum(rate(sysom_cg_memgdrcm_latency{value=\"memDrcm_glb_lat_total\"}[$__rate_interval])) > 0) or (topk(1,sysom_net_tcp_count{value=\"RetransSegs\"}))) + 1", - "hide": false, - "interval": "", - "legendFormat": "正常节点", - "range": true, - "refId": "A" - } - ], - "title": "集群内存健康度", - "type": "piechart" - }, - { - "datasource": "sysom-prometheus", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } }, - "mappings": [] - }, - "overrides": [ { "matcher": { "id": "byName", - "options": "ratelimit callbacks suppressed" + "options": "sys" }, "properties": [ { "id": "color", "value": { - "fixedColor": "super-light-purple", + "fixedColor": "red", "mode": "fixed" } } @@ -893,13 +1109,13 @@ { "matcher": { "id": "byName", - "options": "异常节点" + "options": "softirq" }, "properties": [ { "id": "color", "value": { - "fixedColor": "orange", + "fixedColor": "dark-purple", "mode": "fixed" } } @@ -908,13 +1124,13 @@ { "matcher": { "id": "byName", - "options": "正常节点" + "options": "iowait" }, "properties": [ { "id": "color", "value": { - "fixedColor": "green", + "fixedColor": "dark-blue", "mode": "fixed" } } @@ -924,15 +1140,12 @@ }, "gridPos": { "h": 7, - "w": 6, - "x": 12, - "y": 8 + "w": 8, + "x": 4, + "y": 16 }, - "id": 69, + "id": 41, "options": { - "displayLabels": [ - "value" - ], "legend": { "displayMode": "list", "placement": "right", @@ -954,34 +1167,69 @@ "sort": "none" } }, - "pluginVersion": "9.2.2", + "pluginVersion": "8.3.1", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": true, - "expr": "count(sysom_proc_cpu_total{mode=\"sys\"}>45) or count(max(rate(sysom_proc_schedstat{value=\"delay\"}[$__rate_interval]))by(instance) > 200000000) or count(topk(1, sysom_proc_cpu_total{mode=\"idle\"}))-1", + "expr": "sum(sysom_proc_cpu_total{mode=\"user\"} + on(instance)sysom_proc_cpu_total{mode=\"nice\"})", + "interval": "", + "legendFormat": "user", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": true, + "expr": "sum(sysom_proc_cpu_total{mode=\"sys\"})", "hide": false, "interval": "", - "legendFormat": "异常节点", + "legendFormat": "sys", "range": true, - "refId": "B" + "refId": "C" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "count(sysom_proc_cpu_total{mode=\"sys\"})-(count(sysom_proc_cpu_total{mode=\"sys\"}>45) or count(max(rate(sysom_proc_schedstat{value=\"delay\"}[$__rate_interval]))by(instance) > 200000000) or count(topk(1, sysom_proc_cpu_total{mode=\"idle\"}))-1)", + "exemplar": true, + "expr": "sum(sysom_proc_cpu_total{mode=\"softirq\"})", "hide": false, - "legendFormat": "正常节点", + "interval": "", + "legendFormat": "softirq", "range": true, - "refId": "A" + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": true, + "expr": "sum(sysom_proc_cpu_total{mode=\"iowait\"})", + "hide": false, + "interval": "", + "legendFormat": "iowait", + "range": true, + "refId": "E" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": true, + "expr": "sum(sysom_proc_cpu_total{mode=\"idle\"})", + "hide": false, + "interval": "", + "legendFormat": "idle", + "range": true, + "refId": "F" } ], - "title": "集群CPU健康度", + "title": "\u96c6\u7fa4CPU\u5229\u7528\u7387\u5206\u5e03", "type": "piechart" }, { "datasource": "sysom-prometheus", + "description": "", "fieldConfig": { "defaults": { "color": { @@ -1030,13 +1278,28 @@ { "matcher": { "id": "byName", - "options": "正常节点" + "options": "sys" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "softirq" }, "properties": [ { "id": "color", "value": { - "fixedColor": "green", + "fixedColor": "dark-purple", "mode": "fixed" } } @@ -1045,31 +1308,73 @@ { "matcher": { "id": "byName", - "options": "异常节点" + "options": "iowait" }, "properties": [ { "id": "color", "value": { - "fixedColor": "orange", + "fixedColor": "dark-blue", "mode": "fixed" } } ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "sum(sysak_proc_cpu_total{mode=\"user\"})", + "sum(sysak_proc_cpu_total{mode=\"nice\"})", + "sum(sysak_proc_cpu_total{mode=\"sys\"})", + "sum(sysak_proc_cpu_total{mode=\"softirq\"})", + "sum(sysak_proc_cpu_total{mode=\"iowait\"})", + "iowait", + "user", + "sys", + "softirq", + "idle", + "sysak_proc_meminfo{exported_instance=\"i-2ze2py754b42by09ig1n\", instance=\"192.168.57.67:32760\", job=\"cluster1\", value=\"kernel_reserved\"}", + "sysak_proc_meminfo{exported_instance=\"i-2ze2py754b42by09ig1n\", instance=\"192.168.57.68:32760\", job=\"cluster1\", value=\"kernel_reserved\"}", + "sysak_proc_meminfo{exported_instance=\"i-2ze2py754b42by09ig1n\", instance=\"192.168.57.69:32760\", job=\"cluster1\", value=\"kernel_reserved\"}", + "sum(sysak_proc_meminfo{value=\"kernel_reserved\"})", + "user_used", + "kernel_reserved", + "free", + "app_used", + "kernel_used", + "app", + "reserved", + "kernel" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] } ] }, "gridPos": { "h": 7, - "w": 6, - "x": 18, - "y": 8 + "w": 7, + "x": 12, + "y": 16 }, - "id": 66, + "id": 15, "options": { - "displayLabels": [ - "value" - ], "legend": { "displayMode": "list", "placement": "right", @@ -1097,9 +1402,9 @@ "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": true, - "expr": "count(sysom_net_tcp_count{value=\"RetransSegs\"}/on(instance)sysom_net_tcp_count{value=\"OutSegs\"}+0.00001>0.005 or topk(1,sysom_net_tcp_count{value=\"RetransSegs\"}))-1", + "expr": "sum(sysom_proc_meminfo{value=\"kernel_reserved\"})", "interval": "", - "legendFormat": "异常节点", + "legendFormat": "reserved", "range": true, "refId": "A" }, @@ -1107,17 +1412,101 @@ "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": true, - "expr": "count(sysom_proc_meminfo{value=\"kernel_used\"}) - count(sysom_net_tcp_count{value=\"RetransSegs\"}/on(instance)sysom_net_tcp_count{value=\"OutSegs\"}+0.00001>0.005 or topk(1,sysom_net_tcp_count{value=\"RetransSegs\"})) + 1", + "expr": "sum(sysom_proc_meminfo{value=\"user_used\"})", "hide": false, "interval": "", - "legendFormat": "正常节点", + "legendFormat": "app", "range": true, - "refId": "F" + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": true, + "expr": "sum(sysom_proc_meminfo{value=\"MemFree\"})", + "hide": false, + "interval": "", + "legendFormat": "free", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": true, + "expr": "sum(sysom_proc_meminfo{value=\"kernel_used\"} - on(instance)sysom_proc_meminfo{value=\"kernel_reserved\"})", + "hide": false, + "interval": "", + "legendFormat": "kernel", + "range": true, + "refId": "D" } ], - "title": "集群存储健康度", + "title": "\u96c6\u7fa4\u5185\u5b58\u4f7f\u7528\u5206\u5e03", "type": "piechart" }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "red", + "value": 1000000000000 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 19, + "y": 16 + }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": true, + "expr": "sum(sysom_proc_meminfo{value=\"MemTotal\"} * 1024)", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "\u96c6\u7fa4\u5185\u5b58\u603b\u91cf", + "type": "stat" + }, { "datasource": "sysom-prometheus", "description": "", @@ -1159,8 +1548,8 @@ "links": [ { "targetBlank": true, - "title": "内存异常诊断中心", - "url": "../diagnose/memory/clustermem?instance=${__field.labels.instance}&time=${__data.fields.Time}&diagnosis_type=内存延时诊断" + "title": "\u5185\u5b58\u5f02\u5e38\u8bca\u65ad\u4e2d\u5fc3", + "url": "../diagnose/memory/clustermem?instance=${__field.labels.instance}\ufeff\ufeff\ufeff&time=\ufeff\ufeff\ufeff${__data.fields.Time}\ufeff\ufeff\ufeff&diagnosis_type=\u5185\u5b58\u5ef6\u65f6\u8bca\u65ad" } ], "mappings": [], @@ -1213,7 +1602,7 @@ "h": 8, "w": 12, "x": 0, - "y": 15 + "y": 23 }, "id": 61, "options": { @@ -1237,7 +1626,7 @@ "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": true, - "expr": "sum(rate(sysom_container_memgdrcm_latency{pod!=\"\", value=\"memDrcm_glb_lat_total\"}[$__rate_interval])) by (instance)", + "expr": "sum(rate(sysom_container_memdrcm_latency{pod!=\"\", value=\"memDrcm_lat_1000ms\"}[$__rate_interval])) by (instance) + sum(rate(sysom_container_memdrcm_latency{pod!=\"\", value=\"memDrcm_lat_10to100ms\"}[$__rate_interval])) by (instance) + sum(rate(sysom_container_memdrcm_latency{pod!=\"\", value=\"memDrcm_lat_500to1000ms\"}[$__rate_interval])) by (instance)", "hide": false, "interval": "", "legendFormat": "{{instance}}", @@ -1245,7 +1634,7 @@ "refId": "B" } ], - "title": "节点内存延时诊断", + "title": "\u8282\u70b9\u5185\u5b58\u5ef6\u65f6\u8bca\u65ad", "type": "timeseries" }, { @@ -1289,8 +1678,8 @@ "links": [ { "targetBlank": true, - "title": "集群内存诊断中心", - "url": "../diagnose/memory/clustermem?instance=${__field.labels.instance}&pod_name=${__field.labels.podname}&time=${__data.fields.Time}&diagnosis_type=内存延时诊断" + "title": "\u96c6\u7fa4\u5185\u5b58\u8bca\u65ad\u4e2d\u5fc3", + "url": "../diagnose/memory/clustermem?instance=${__field.labels.instance}&pod_name=${__field.labels.podname}&time=${__data.fields.Time}&diagnosis_type=\u5185\u5b58\u5ef6\u65f6\u8bca\u65ad" } ], "mappings": [], @@ -1343,7 +1732,7 @@ "h": 8, "w": 12, "x": 12, - "y": 15 + "y": 23 }, "id": 62, "interval": "30s", @@ -1368,20 +1757,15 @@ "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": true, - "expr": "sum(rate(sysom_container_memdrcm_latency{pod!=\"\", value=\"memDrcm_lat_total\"}[$__rate_interval])) by (pod, instance)\n#+on(podname)sysom_cg_memgdrcm_latency{value=\"memDrcm_glb_lat_total\"}", + "expr": "sum(rate(sysom_container_memdrcm_latency{pod!=\"\", value=\"memDrcm_lat_10to100ms\"}[$__rate_interval])) by (pod, instance) + sum(rate(sysom_container_memdrcm_latency{pod!=\"\", value=\"memDrcm_lat_100to500ms\"}[$__rate_interval])) by (pod, instance) + sum(rate(sysom_container_memdrcm_latency{pod!=\"\", value=\"memDrcm_lat_500to1000ms\"}[$__rate_interval])) by (pod, instance) + sum(rate(sysom_container_memdrcm_latency{pod!=\"\", value=\"memDrcm_lat_1000ms\"}[$__rate_interval])) by (pod, instance)", "hide": false, "interval": "", "legendFormat": "{{pod}}", "range": true, "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "hide": false, - "refId": "C" } ], - "title": "容器内存延时诊断", + "title": "\u5bb9\u5668\u5185\u5b58\u5ef6\u65f6\u8bca\u65ad", "type": "timeseries" }, { @@ -1424,8 +1808,8 @@ "links": [ { "targetBlank": true, - "title": "内存异常诊断中心", - "url": "../diagnose/memory/clustermem?instance=${__field.labels.instance}&time=${__data.fields.Time}&diagnosis_type=内存高诊断" + "title": "\u5185\u5b58\u5f02\u5e38\u8bca\u65ad\u4e2d\u5fc3", + "url": "../diagnose/memory/clustermem?instance=${__field.labels.instance}\ufeff\ufeff\ufeff&time=\ufeff\ufeff\ufeff${__data.fields.Time}\ufeff\ufeff\ufeff&diagnosis_type=\u5185\u5b58\u9ad8\u8bca\u65ad" } ], "mappings": [], @@ -1455,10 +1839,10 @@ "overrides": [] }, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 0, - "y": 23 + "y": 31 }, "id": 72, "options": { @@ -1487,7 +1871,7 @@ "refId": "A" } ], - "title": "节点内存使用率诊断", + "title": "\u8282\u70b9\u5185\u5b58\u4f7f\u7528\u7387\u8bca\u65ad", "type": "timeseries" }, { @@ -1528,7 +1912,13 @@ "mode": "area" } }, - "links": [], + "links": [ + { + "targetBlank": true, + "title": "CPU\u4e89\u62a2\u8bca\u65ad\u4e2d\u5fc3", + "url": "../diagnose/cpu/cpuhigh?instance=${__field.labels.instance}&moment=${__data.fields.Time}" + } + ], "mappings": [], "thresholds": { "mode": "absolute", @@ -1579,7 +1969,7 @@ "h": 9, "w": 12, "x": 12, - "y": 23 + "y": 31 }, "id": 8, "options": { @@ -1611,12 +2001,12 @@ "refId": "B" } ], - "title": "节点CPU利用率诊断", + "title": "\u8282\u70b9CPU\u5229\u7528\u7387\u8bca\u65ad", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "各个节点的CPU延时诊断", + "description": "\u5404\u4e2a\u8282\u70b9\u7684CPU\u5ef6\u65f6\u8bca\u65ad", "fieldConfig": { "defaults": { "color": { @@ -1652,7 +2042,13 @@ "mode": "off" } }, - "links": [], + "links": [ + { + "targetBlank": true, + "title": "cpu\u4e89\u62a2\u8bca\u65ad\u4e2d\u5fc3", + "url": "../diagnose/cpu/cpuhigh?instance=${__field.labels.instance}&moment=${__data.fields.Time}" + } + ], "mappings": [], "thresholds": { "mode": "absolute", @@ -1675,7 +2071,7 @@ "h": 9, "w": 12, "x": 0, - "y": 32 + "y": 39 }, "id": 45, "options": { @@ -1707,7 +2103,7 @@ "refId": "A" } ], - "title": "节点CPU延时诊断", + "title": "\u8282\u70b9CPU\u5ef6\u65f6\u8bca\u65ad", "type": "timeseries" }, { @@ -1716,11 +2112,11 @@ "h": 1, "w": 24, "x": 0, - "y": 41 + "y": 48 }, "id": 58, "panels": [], - "title": "集群资源详情", + "title": "\u96c6\u7fa4\u8d44\u6e90\u8be6\u60c5", "type": "row" }, { @@ -1763,7 +2159,13 @@ "mode": "line+area" } }, - "links": [], + "links": [ + { + "targetBlank": true, + "title": "node cpu", + "url": "http://100.82.241.245:3000/d/rYdddlPWk/sysak_base?orgId=1&from=1685030400000&to=1685033999000" + } + ], "mappings": [], "thresholds": { "mode": "absolute", @@ -1819,7 +2221,7 @@ "h": 9, "w": 12, "x": 0, - "y": 42 + "y": 49 }, "id": 9, "options": { @@ -1850,7 +2252,7 @@ "refId": "A" } ], - "title": "集群平均 CPU利用率", + "title": "\u96c6\u7fa4\u5e73\u5747 CPU\u5229\u7528\u7387", "type": "timeseries" }, { @@ -1911,7 +2313,7 @@ "h": 9, "w": 12, "x": 12, - "y": 42 + "y": 49 }, "id": 48, "options": { @@ -1938,7 +2340,7 @@ "refId": "A" } ], - "title": "集群平均内存使用率", + "title": "\u96c6\u7fa4\u5e73\u5747\u5185\u5b58\u4f7f\u7528\u7387", "type": "timeseries" }, { @@ -1999,7 +2401,7 @@ "h": 9, "w": 12, "x": 0, - "y": 51 + "y": 58 }, "id": 7, "options": { @@ -2074,12 +2476,12 @@ "refId": "E" } ], - "title": "集群平均CPU利用率分布情况", + "title": "\u96c6\u7fa4\u5e73\u5747CPU\u5229\u7528\u7387\u5206\u5e03\u60c5\u51b5", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "集群所有节点总内存使用情况", + "description": "\u96c6\u7fa4\u6240\u6709\u8282\u70b9\u603b\u5185\u5b58\u4f7f\u7528\u60c5\u51b5", "fieldConfig": { "defaults": { "color": { @@ -2137,7 +2539,7 @@ "h": 9, "w": 12, "x": 12, - "y": 51 + "y": 58 }, "id": 31, "options": { @@ -2202,7 +2604,7 @@ "refId": "D" } ], - "title": "总内存使用情况", + "title": "\u603b\u5185\u5b58\u4f7f\u7528\u60c5\u51b5", "type": "timeseries" }, { @@ -2280,7 +2682,7 @@ "h": 8, "w": 12, "x": 0, - "y": 60 + "y": 67 }, "id": 70, "options": { @@ -2305,12 +2707,12 @@ "editorMode": "code", "expr": "sum(rate(sysom_proc_schedstat{value=\"delay\"}[$__rate_interval])) / count(sysom_proc_schedstat{value=\"delay\"})", "hide": false, - "legendFormat": "调度延迟", + "legendFormat": "\u8c03\u5ea6\u5ef6\u8fdf", "range": true, "refId": "B" } ], - "title": "集群平均调度延迟", + "title": "\u96c6\u7fa4\u5e73\u5747\u8c03\u5ea6\u5ef6\u8fdf", "type": "timeseries" }, { @@ -2372,7 +2774,7 @@ "h": 8, "w": 12, "x": 12, - "y": 60 + "y": 67 }, "id": 33, "options": { @@ -2465,7 +2867,7 @@ "refId": "G" } ], - "title": "集群用户态内存使用情况", + "title": "\u96c6\u7fa4\u7528\u6237\u6001\u5185\u5b58\u4f7f\u7528\u60c5\u51b5", "type": "timeseries" }, { @@ -2542,7 +2944,7 @@ "h": 8, "w": 12, "x": 0, - "y": 68 + "y": 75 }, "id": 20, "options": { @@ -2573,7 +2975,7 @@ "refId": "A" } ], - "title": "集群节点平均load1", + "title": "\u96c6\u7fa4\u8282\u70b9\u5e73\u5747load1", "type": "timeseries" }, { @@ -2635,7 +3037,7 @@ "h": 8, "w": 12, "x": 12, - "y": 68 + "y": 75 }, "id": 40, "options": { @@ -2728,12 +3130,12 @@ "refId": "G" } ], - "title": "集群内核态内存使用情况", + "title": "\u96c6\u7fa4\u5185\u6838\u6001\u5185\u5b58\u4f7f\u7528\u60c5\u51b5", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "1 fork数量;2 处于IO阻塞任务数量", + "description": "1 fork\u6570\u91cf\uff1b2 \u5904\u4e8eIO\u963b\u585e\u4efb\u52a1\u6570\u91cf", "fieldConfig": { "defaults": { "color": { @@ -2806,7 +3208,7 @@ "h": 8, "w": 12, "x": 0, - "y": 76 + "y": 83 }, "id": 55, "options": { @@ -2848,7 +3250,7 @@ "refId": "B" } ], - "title": "集群节点任务统计信息", + "title": "\u96c6\u7fa4\u8282\u70b9\u4efb\u52a1\u7edf\u8ba1\u4fe1\u606f", "type": "timeseries" }, { @@ -2910,7 +3312,7 @@ "h": 8, "w": 12, "x": 12, - "y": 76 + "y": 83 }, "id": 39, "options": { @@ -2948,67 +3350,55 @@ "refId": "B" } ], - "title": "集群app与kernel内存对比", + "title": "\u96c6\u7fa4app\u4e0ekernel\u5185\u5b58\u5bf9\u6bd4", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "容器大盘", + "description": "\u5bb9\u5668\u5927\u76d8", "gridPos": { "h": 2, "w": 24, "x": 0, - "y": 84 + "y": 91 }, "id": 54, - "links": [ - { - "targetBlank": true, - "title": "容器大盘(链接)", - "url": "../monitor/container_monitor" - } - ], + "links": [], "options": { "code": { "language": "plaintext", "showLineNumbers": false, "showMiniMap": false }, - "content": "容器大盘详情", - "mode": "html" + "content": "[\u5bb9\u5668\u5927\u76d8\u8be6\u60c5](../grafana/d/rYdddlPWW/rong-qi-jian-kong?var-node=192.168.0.12:8889&var-podname=All&var-podns=All&orgId=1&refresh=5s)\n\n", + "mode": "markdown" }, "pluginVersion": "9.2.2", - "title": "容器大盘", + "title": "\u5bb9\u5668\u5927\u76d8", "type": "text" }, { "datasource": "sysom-prometheus", - "description": "节点大盘", + "description": "\u8282\u70b9\u5927\u76d8", "gridPos": { "h": 2, "w": 24, "x": 0, - "y": 86 + "y": 93 }, "id": 53, - "links": [ - { - "targetBlank": true, - "title": "节点大盘(链接)", - "url": "../monitor/node_monitor" - } - ], + "links": [], "options": { "code": { "language": "plaintext", "showLineNumbers": false, "showMiniMap": false }, - "content": "节点大盘详情", - "mode": "html" + "content": "[\u8282\u70b9\u5927\u76d8\u8be6\u60c5](../grafana/d/rYdddlPWk/sysom_base?orgId=1&refresh=5s)\n\n", + "mode": "markdown" }, "pluginVersion": "9.2.2", - "title": "节点大盘", + "title": "\u8282\u70b9\u5927\u76d8", "type": "text" } ], @@ -3018,6 +3408,30 @@ "tags": [], "templating": { "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "datasource": "sysom-prometheus", + "definition": "label_values(sysom_cluster_health_score, cluster)", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": "label_values(sysom_cluster_health_score, cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, { "current": { "selected": true, @@ -3082,9 +3496,9 @@ }, "timepicker": {}, "timezone": "", - "title": "集群视角", + "title": "\u96c6\u7fa4\u89c6\u89d2", "uid": "F4UBT8w4k", - "version": 9, + "version": 10, "weekStart": "" } } \ No newline at end of file diff --git a/deps/4_grafana/sysom-container-dashboard.json b/deps/4_grafana/sysom-container-dashboard.json index 45fa805cfa814ea4dc64c02cd818170687e76650..6e4b9ccf7d5a85e6d5c889366fb8557367fa3cbc 100644 --- a/deps/4_grafana/sysom-container-dashboard.json +++ b/deps/4_grafana/sysom-container-dashboard.json @@ -36,6 +36,1293 @@ "x": 0, "y": 0 }, + "id": 450, + "panels": [], + "title": "pod\u6982\u89c8", + "type": "row" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "#E24D42", + "value": 0 + }, + { + "color": "#EF843C", + "value": 60 + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 0, + "y": 1 + }, + "id": 454, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\", type=\"total\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Health", + "type": "gauge" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 60 + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 5, + "y": 1 + }, + "id": 458, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"error\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 20 + } + ], + "title": "Errors Health", + "type": "gauge" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 10, + "y": 1 + }, + "id": 460, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"latency\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 20 + } + ], + "title": "Latency Health", + "type": "gauge" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 60 + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 15, + "y": 1 + }, + "id": 456, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"capacity\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 20 + } + ], + "title": "Saturation Health", + "type": "gauge" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 60 + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "green", + "value": 100 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 462, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"load\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A", + "step": 20 + } + ], + "title": "Load(Traffic) Health", + "type": "gauge" + }, + { + "datasource": "sysom-mysql", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "\u6307\u6807\u5f97\u5206" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + { + "id": "custom.width", + "value": 144 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u5f02\u5e38\u6307\u6807" + }, + "properties": [ + { + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ + { + "id": "custom.width", + "value": 123 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 6 + }, + "id": 472, + "interval": "30s", + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-mysql", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\npod,\ninstance,\nnamespace,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE pod = $pod AND namespace = $podns AND metric_type = \"capacity\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Saturation Health", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "instance": true, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "namespace": 3, + "pod": 2, + "score": 4, + "value": 5 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" + } + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } + } + ], + "type": "table" + }, + { + "datasource": "sysom-mysql", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "\u6307\u6807\u5f97\u5206" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + { + "id": "custom.width", + "value": 144 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u5f02\u5e38\u6307\u6807" + }, + "properties": [ + { + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ + { + "id": "custom.width", + "value": 123 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 6 + }, + "id": 473, + "interval": "30s", + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-mysql", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\npod,\ninstance,\nnamespace,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE pod = $pod AND namespace = $podns AND metric_type = \"load\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Load Health", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "instance": true, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "namespace": 3, + "pod": 2, + "score": 4, + "value": 5 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" + } + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } + } + ], + "type": "table" + }, + { + "datasource": "sysom-mysql", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "\u6307\u6807\u5f97\u5206" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + { + "id": "custom.width", + "value": 144 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u5f02\u5e38\u6307\u6807" + }, + "properties": [ + { + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ + { + "id": "custom.width", + "value": 123 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 6 + }, + "id": 474, + "interval": "30s", + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-mysql", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\npod,\ninstance,\nnamespace,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE pod = $pod AND namespace = $podns AND metric_type = \"latency\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Latency Health", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "instance": true, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "namespace": 3, + "pod": 2, + "score": 4, + "value": 5 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" + } + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } + } + ], + "type": "table" + }, + { + "datasource": "sysom-mysql", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "\u6307\u6807\u5f97\u5206" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } + }, + { + "id": "custom.width", + "value": 144 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "\u5f02\u5e38\u6307\u6807" + }, + "properties": [ + { + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ + { + "id": "custom.width", + "value": 123 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 6 + }, + "id": 475, + "interval": "30s", + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-mysql", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\npod,\ninstance,\nnamespace,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE pod = $pod AND namespace = $podns AND metric_type = \"error\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Error Health", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "instance": true, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "namespace": 3, + "pod": 2, + "score": 4, + "value": 5 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" + } + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, "id": 405, "panels": [], "title": "Pod Memory Monitor", @@ -100,7 +1387,182 @@ "h": 8, "w": 12, "x": 0, - "y": 1 + "y": 15 + }, + "id": 448, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "topk(5, sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"usage\"}) by (pod, value))", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Memory Usage (top 5)", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 446, + "interval": "30", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "topk(5, sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"cache\"}) by (pod, value))", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Cache Usage (top 5)", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 23 }, "id": 412, "options": { @@ -123,7 +1585,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"cache\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"cache\"}) by (pod, value)", "legendFormat": "{{pod}}-{{value}}", "range": true, "refId": "A" @@ -131,7 +1593,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"rss\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$podname\",value=\"rss\"}) by (pod,value)", "hide": false, "legendFormat": "{{pod}}-{{value}}", "range": true, @@ -140,7 +1602,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"shmem\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"shmem\"}) by (pod,value)", "hide": false, "legendFormat": "{{pod}}-{{value}}", "range": true, @@ -149,7 +1611,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"inactive_file\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"inactive_file\"}) by (pod, value)", "hide": false, "legendFormat": "{{pod}}-{{value}}", "range": true, @@ -158,7 +1620,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"active_file\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"active_file\"}) by (pod, value)", "hide": false, "legendFormat": "{{pod}}-{{value}}", "range": true, @@ -167,7 +1629,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"inactive_anon\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"inactive_anon\"}) by (pod, value)", "hide": false, "legendFormat": "{{pod}}-{{value}}", "range": true, @@ -176,7 +1638,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"active_anon\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"active_file\"}) by (pod, value)", "hide": false, "legendFormat": "{{pod}}-{{value}}", "range": true, @@ -185,7 +1647,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"usage\"} - ignoring(value)ysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"inactive_file\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"usage\"} - ignoring(value)sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"inactive_file\"}) by (pod, value)", "hide": false, "legendFormat": "{{pod}}-wss", "range": true, @@ -194,7 +1656,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memUtil{namespace=~\"$namespace\", pod=~\"$pod\",value=\"usage\"}", + "expr": "sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"usage\"}) by (pod, value)", "hide": false, "legendFormat": "{{pod}}-{{value}}", "range": true, @@ -260,12 +1722,13 @@ "overrides": [] }, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 12, - "y": 1 + "y": 23 }, "id": 407, + "interval": "30s", "options": { "legend": { "calcs": [ @@ -286,13 +1749,13 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_podmem{namespace=~\"$namespace\",pod=~\"$pod\",value=\"cached\"}", + "expr": "sysom_podmem{namespace=~\"$podns\",pod=~\"$pod\",value=\"cached\"}", "legendFormat": "{{pod}}-{{file}}", "range": true, "refId": "A" } ], - "title": "Pod Cache", + "title": "Pod Cached File (top 5)", "type": "timeseries" }, { @@ -351,13 +1814,13 @@ "overrides": [] }, "gridPos": { - "h": 7, - "w": 8, + "h": 9, + "w": 12, "x": 0, - "y": 9 + "y": 32 }, - "id": 411, - "interval": "30", + "id": 410, + "interval": "60s", "options": { "legend": { "calcs": [ @@ -378,40 +1841,40 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memgdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_glb_lat_0to1ms\"} + on(pod, container)sysom_container_memgdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_glb_lat_1to5ms\"} + on(pod, container)sysom_container_memgdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_glb_lat_5to10ms\"}", - "legendFormat": "{{pod}}-memDrcm_glb_lat_1to10ms", + "expr": "sum(rate(sysom_container_memfail_cnt{namespace=~\"$podns\",pod=~\"$pod\",value=\"fail_cnt\"}[$__rate_interval])) by (pod, value)", + "legendFormat": "{{pod}}-{{value}}", "range": true, "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memgdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_glb_lat_10to100ms\"}", + "expr": "sum(rate(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"pgpgin\"}[$__rate_interval])) by (pod, value)", "hide": false, - "legendFormat": "{{pod}}-memDrcm_glb_lat_10to100ms", + "legendFormat": "{{pod}}-{{value}}", "range": true, "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memgdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_glb_lat_100to500ms\"}", + "expr": "sum(rate(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"pgpgout\"}[$__rate_interval])) by (pod, value)", "hide": false, - "legendFormat": "{{pod}}-memDrcm_glb_lat_100to500ms", + "legendFormat": "{{pod}}-{{value}}", "range": true, - "refId": "D" + "refId": "C" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_glb_lat_500to1000ms\"} + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_glb_lat_1000ms\"}", + "expr": "sum(rate(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"pgfault\"}[$__rate_interval])) by (pod, value)", "hide": false, - "legendFormat": "{{pod}}-memDrcm_glb_lat_500to1000ms", + "legendFormat": "{{pod}}-{{value}}", "range": true, - "refId": "C" + "refId": "D" } ], - "title": "Memory Global Direct Reclaim Latency", + "title": "Pod Mem Event", "type": "timeseries" }, { @@ -465,17 +1928,17 @@ } ] }, - "unit": "none" + "unit": "percent" }, "overrides": [] }, "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 9 + "h": 9, + "w": 12, + "x": 12, + "y": 32 }, - "id": 409, + "id": 415, "options": { "legend": { "calcs": [ @@ -496,40 +1959,31 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_lat_0to1ms\"} + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_lat_1to5ms\"} + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_lat_5to10ms\"}", - "legendFormat": "{{pod}}-memDrcm_lat_1to10ms", + "expr": "(sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"usage\"}) by (pod, container) / sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"limit\"}) by (pod, container)) * 100", + "legendFormat": "{{pod}}-usage", "range": true, "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_lat_10to100ms\"}", + "expr": "(sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"rss\"}) by (pod, container) / sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"usage\"}) by (pod, container)) * 100", "hide": false, - "legendFormat": "{{pod}}-memDrcm_lat_10to100ms", + "legendFormat": "{{pod}}-rss_ratio", "range": true, "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_lat_100to500ms\"}", - "hide": false, - "legendFormat": "{{pod}}-memDrcm_lat_100to500ms", - "range": true, - "refId": "D" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_lat_500to1000ms\"} + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDrcm_lat_1000ms\"}", + "expr": "(sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"cache\"}) by (pod, container) / sum(sysom_container_memUtil{namespace=~\"$podns\",pod=~\"$pod\",value=\"usage\"}) by (pod, container)) * 100", "hide": false, - "legendFormat": "{{pod}}-memDrcm_lat_500to1000ms", + "legendFormat": "{{pod}}-cache_ratio", "range": true, "refId": "C" } ], - "title": "Memory Direct Reclaim Latency", + "title": "Memory Rate", "type": "timeseries" }, { @@ -574,8 +2028,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -588,12 +2041,12 @@ "overrides": [] }, "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 9 + "h": 10, + "w": 12, + "x": 0, + "y": 41 }, - "id": 413, + "id": 411, "options": { "legend": { "calcs": [ @@ -614,40 +2067,40 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memmcmp_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDcmp_lat_1to5ms\"} + on(pod, container)sysom_container_memmcmp_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDcmp_lat_5to10ms\"}", - "legendFormat": "{{pod}}-memDcmp_lat_1to10ms", + "expr": "clamp_min(sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_0to1ms\"} + on(pod, container)sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_1to5ms\"} + on(pod, container)sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_5to10ms\"}) by (pod) - sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_0to1ms\"} offset 30s + on(pod, container)sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_1to5ms\"} offset 30s + on(pod, container)sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_5to10ms\"} offset 30s) by (pod), 0)", + "legendFormat": "{{pod}}-memDrcm_glb_lat_1to10ms", "range": true, "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memmcmp_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDcmp_lat_10to100ms\"}", + "expr": "clamp_min(sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_10to100ms\"}) by (pod) - sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_10to100ms\"} offset 30s) by (pod), 0)", "hide": false, - "legendFormat": "{{pod}}-memDcmp_lat_10to100ms", + "legendFormat": "{{pod}}-memDrcm_glb_lat_10to100ms", "range": true, "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_cg_memmcmp_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDcmp_lat_100to500ms\"}", + "expr": "clamp_min(sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_100to500ms\"}) by (pod) - sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_100to500ms\"} offset 30s) by (pod), 0)", "hide": false, - "legendFormat": "{{pod}}-memDcmp_lat_100to500ms", + "legendFormat": "{{pod}}-memDrcm_glb_lat_100to500ms", "range": true, "refId": "D" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_memmcmp_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDcmp_lat_500to1000ms\"} + on(pod, container)sysom_container_memmcmp_latency{namespace=~\"$namespace\",pod=~\"$pod\",value=\"memDcmp_lat_1000ms\"}", + "expr": "clamp_min(sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_500to1000ms\"} + on(pod, container)sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_1000ms\"}) by (pod) - sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_500to1000ms\"} offset 30s + on(pod, container)sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_glb_lat_1000ms\"} offset 30s) by (pod), 0)", "hide": false, - "legendFormat": "{{pod}}-memDcmp_lat_500to1000ms", + "legendFormat": "{{pod}}-memDrcm_glb_lat_500to1000ms", "range": true, "refId": "C" } ], - "title": "Memory Compact Latency", + "title": "Memory Global Direct Reclaim Latency", "type": "timeseries" }, { @@ -664,7 +2117,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 24, "gradientMode": "none", "hideFrom": { "legend": false, @@ -692,8 +2145,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -701,18 +2153,18 @@ } ] }, - "unit": "percent" + "unit": "none" }, "overrides": [] }, "gridPos": { - "h": 7, + "h": 10, "w": 12, - "x": 0, - "y": 16 + "x": 12, + "y": 41 }, - "id": 415, - "interval": "30s", + "id": 409, + "interval": "30", "options": { "legend": { "calcs": [ @@ -733,31 +2185,40 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "(sum(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"usage\"}) by (pod, container) / sum(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"limit\"}) by (pod, container)) * 100", - "legendFormat": "{{pod}}-usage/limit", + "expr": "clamp_min(sum(sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_0to1ms\"} + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_1to5ms\"} + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_5to10ms\"}) by (pod) - sum(sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_0to1ms\"} offset 30s + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_1to5ms\"} offset 30s + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_5to10ms\"} offset 30s) by (pod), 0)", + "legendFormat": "{{pod}}-memDrcm_lat_1to10ms", "range": true, "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "(sum(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"rss\"}) by (pod, container) / sum(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"usage\"}) by (pod, container)) * 100", + "expr": "clamp_min(sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_10to100ms\"}) by (pod) - sum(sysom_container_memgdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_10to100ms\"} offset 30s) by (pod), 0)", "hide": false, - "legendFormat": "{{pod}}-rss-ratio", + "legendFormat": "{{pod}}-memDrcm_lat_10to100ms", "range": true, "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "(sum(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"cache\"}) by (pod, container) / sum(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"usage\"}) by (pod, container)) * 100", + "expr": "clamp_min(sum(sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_100to500ms\"}) by (pod) - sum(sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_100to500ms\"} offset 30s) by (pod), 0)", + "hide": false, + "legendFormat": "{{pod}}-memDrcm_lat_100to500ms", + "range": true, + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "clamp_min(sum(sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_500to1000ms\"} + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_1000ms\"}) by (pod) - sum(sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_500to1000ms\"} offset 30s + on(pod, container)sysom_container_memdrcm_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDrcm_lat_1000ms\"} offset 30s) by (pod), 0)", "hide": false, - "legendFormat": "{{pod}}-cache-ratio", + "legendFormat": "{{pod}}-memDrcm_lat_500to1000ms", "range": true, "refId": "C" } ], - "title": "Memory Rate", + "title": "Memory Direct Reclaim Latency", "type": "timeseries" }, { @@ -802,8 +2263,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -816,13 +2276,12 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 10, "w": 12, - "x": 12, - "y": 16 + "x": 0, + "y": 51 }, - "id": 410, - "interval": "30s", + "id": 413, "options": { "legend": { "calcs": [ @@ -843,40 +2302,40 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_container_memfail_cnt{namespace=~\"$namespace\",pod=~\"$pod\",value=\"fail_cnt\"}[$__rate_interval])", - "legendFormat": "{{pod}}-{{value}}", + "expr": "clamp_min(sum(sysom_container_memmcmp_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDcmp_lat_1to5ms\"} + on(pod, container)sysom_container_memmcmp_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDcmp_lat_5to10ms\"}) by (pod) - sum(sysom_container_memmcmp_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDcmp_lat_1to5ms\"} offset 30s + on(pod, container)sysom_container_memmcmp_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDcmp_lat_5to10ms\"} offset 30s) by (pod), 0)", + "legendFormat": "{{pod}}-memDcmp_lat_1to10ms", "range": true, "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"pgpgin\"}[$__rate_interval])", + "expr": "sysom_container_memmcmp_latency{namespace=~\"$podns\",pod=~\"$pod\",value=\"memDcmp_lat_10to100ms\"}", "hide": false, - "legendFormat": "{{pod}}-{{value}}", + "legendFormat": "{{pod}}-memDcmp_lat_10to100ms", "range": true, "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"pgpgout\"}[$__rate_interval])", + "expr": "sysom_cg_memmcmp_latency{podns=~\"$podns\",podname=~\"$podname\",value=\"memDcmp_lat_100to500ms\"}", "hide": false, - "legendFormat": "{{pod}}-{{value}}", + "legendFormat": "{{pod}}-memDcmp_lat_100to500ms", "range": true, - "refId": "C" + "refId": "D" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_container_memUtil{namespace=~\"$namespace\",pod=~\"$pod\",value=\"pgfault\"}[$__rate_interval])", + "expr": "sysom_cg_memmcmp_latency{podns=~\"$podns\",podname=~\"$podname\",value=\"memDcmp_lat_500to1000ms\"} + on(podname, container)sysom_cg_memmcmp_latency{podns=~\"$podns\",podname=~\"$podname\",value=\"memDcmp_lat_1000ms\"}", "hide": false, - "legendFormat": "{{pod}}-{{value}}", + "legendFormat": "{{podname}}-memDcmp_lat_500to1000ms", "range": true, - "refId": "D" + "refId": "C" } ], - "title": "Pod Mem Event", + "title": "Memory Compact Latency", "type": "timeseries" }, { @@ -885,7 +2344,7 @@ "h": 1, "w": 24, "x": 0, - "y": 23 + "y": 61 }, "id": 417, "panels": [], @@ -950,7 +2409,7 @@ "h": 8, "w": 12, "x": 0, - "y": 24 + "y": 62 }, "id": 419, "options": { @@ -973,26 +2432,26 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_cpuacct_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"total\"}", - "legendFormat": "{{podname}}-{{value}}", + "expr": "sum(sysom_container_cpuacct_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"total\"}) by (pod)", + "legendFormat": "{{pod}}-{{value}}", "range": true, "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_cpuacct_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"user\"}", + "expr": "sum(sysom_container_cpuacct_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"user\"}) by (pod)", "hide": false, - "legendFormat": "{{podname}}-{{value}}", + "legendFormat": "{{pod}}-{{value}}", "range": true, "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_cpuacct_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"system\"}", + "expr": "sum(sysom_container_cpuacct_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"system\"}) by (pod)", "hide": false, - "legendFormat": "{{podname}}-{{value}}", + "legendFormat": "{{pod}}-{{value}}", "range": true, "refId": "C" } @@ -1057,10 +2516,10 @@ "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 62 }, "id": 420, - "interval": "30s", + "interval": "60s", "options": { "legend": { "calcs": [ @@ -1081,13 +2540,13 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_container_cpu_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"nr_throttled\"}[$__rate_interval])", - "legendFormat": "{{podname}}-throttled", + "expr": "rate(sysom_container_cpu_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"nr_throttled\"}[$__rate_interval])", + "legendFormat": "{{pod}}-throttled", "range": true, "refId": "A" } ], - "title": "Pod CPU", + "title": "Pod CPU nr_throttled", "type": "timeseries" }, { @@ -1124,7 +2583,7 @@ "mode": "none" }, "thresholdsStyle": { - "mode": "off" + "mode": "dashed+area" } }, "mappings": [], @@ -1136,10 +2595,11 @@ }, { "color": "red", - "value": 80 + "value": 15 } ] - } + }, + "unit": "ms" }, "overrides": [] }, @@ -1147,7 +2607,7 @@ "h": 8, "w": 12, "x": 0, - "y": 32 + "y": 70 }, "id": 441, "interval": "60s", @@ -1167,8 +2627,8 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_cpuacct_wait_latency{pod=~\"$pod\", namespace=~\"$namespace\", value=\"wait_lat_total\"}", - "legendFormat": "{{pod}}-{{value}}", + "expr": "rate(sysom_container_cpuacct_wait_latency{value=\"wait_lat_total\"}[$__rate_interval])", + "legendFormat": "{{pod}}-quota_ratio", "range": true, "refId": "A" } @@ -1233,7 +2693,7 @@ "h": 8, "w": 12, "x": 12, - "y": 32 + "y": 70 }, "id": 442, "interval": "60s", @@ -1253,7 +2713,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_container_cfs_quota{value=\"quota_ratio\",namespace=~\"$namespace\",pod=~\"$pod\"}", + "expr": "sysom_container_cfs_quota{value=\"quota_ratio\",namespace=~\"$podns\",pod=~\"$pod\"}", "legendFormat": "{{pod}}-quota_ratio", "range": true, "refId": "A" @@ -1268,7 +2728,7 @@ "h": 1, "w": 24, "x": 0, - "y": 40 + "y": 78 }, "id": 424, "panels": [], @@ -1333,7 +2793,7 @@ "h": 8, "w": 12, "x": 0, - "y": 41 + "y": 79 }, "id": 438, "interval": "60s", @@ -1353,7 +2813,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sum(rate(sysom_container_network_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"net_tx_bytes\"}[5m])) by (pod)", + "expr": "sum(rate(sysom_container_network_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"net_tx_bytes\"}[5m])) by (pod)", "legendFormat": "{{pod}}-transmit", "range": true, "refId": "A" @@ -1361,7 +2821,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "-sum(rate(sysom_container_network_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"net_rx_bytes\"}[5m])) by (pod)", + "expr": "-sum(rate(sysom_container_network_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"net_rx_bytes\"}[5m])) by (pod)", "hide": false, "legendFormat": "{{pod}}-receive", "range": true, @@ -1429,7 +2889,7 @@ "h": 8, "w": 12, "x": 12, - "y": 41 + "y": 79 }, "id": 428, "interval": "60s", @@ -1449,7 +2909,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sum(rate(sysom_container_network_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"net_tx_packets\"}[5m])) by (pod)", + "expr": "sum(rate(sysom_container_network_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"net_tx_packets\"}[5m])) by (pod)", "legendFormat": "{{pod}}-transmit", "range": true, "refId": "A" @@ -1457,7 +2917,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "-sum(rate(sysom_container_network_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"net_rx_packets\"}[5m])) by (pod)", + "expr": "-sum(rate(sysom_container_network_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"net_rx_packets\"}[5m])) by (pod)", "hide": false, "legendFormat": "{{pod}}-reads", "range": true, @@ -1524,7 +2984,7 @@ "h": 8, "w": 12, "x": 0, - "y": 49 + "y": 87 }, "id": 440, "interval": "60s", @@ -1544,7 +3004,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sum(irate(sysom_container_network_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"net_tx_dropped\"}[5m])) by (pod) / sum(irate(sysom_container_network_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"net_tx_packets\"}[5m])) by (pod)", + "expr": "sum(irate(sysom_container_network_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"net_tx_dropped\"}[5m])) by (pod) / sum(irate(sysom_container_network_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"net_tx_packets\"}[5m])) by (pod)", "legendFormat": "{{pod}}-tx-drop", "range": true, "refId": "A" @@ -1552,7 +3012,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sum(irate(sysom_container_network_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"net_rx_dropped\"}[5m])) by (pod) / sum(irate(sysom_container_network_stat{namespace=~\"$namespace\",pod=~\"$pod\",value=\"net_rx_packets\"}[5m])) by (pod)", + "expr": "sum(irate(sysom_container_network_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"net_rx_dropped\"}[5m])) by (pod) / sum(irate(sysom_container_network_stat{namespace=~\"$podns\",pod=~\"$pod\",value=\"net_rx_packets\"}[5m])) by (pod)", "hide": false, "legendFormat": "{{pod}}-rx-drop", "range": true, @@ -1568,7 +3028,7 @@ "h": 1, "w": 24, "x": 0, - "y": 57 + "y": 95 }, "id": 422, "panels": [], @@ -1633,7 +3093,7 @@ "h": 8, "w": 12, "x": 0, - "y": 58 + "y": 96 }, "id": 430, "interval": "60s", @@ -1653,7 +3113,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sum(rate(sysom_container_blkio_stat{namespace=~\"$namespace\", pod=~\"$pod\", pod=~\".+\", value=\"writes_service_bytes\"}[5m]) / (1024 * 1024)) by (device,pod)", + "expr": "sum(rate(sysom_container_blkio_stat{namespace=~\"$podns\", pod=~\"$pod\", pod=~\".+\", value=\"writes_service_bytes\"}[5m]) / (1024 * 1024)) by (device,pod)", "legendFormat": "{{pod}}-{{device}}-writes", "range": true, "refId": "A" @@ -1661,7 +3121,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "-sum(rate(sysom_container_blkio_stat{namespace=~\"$namespace\", pod=~\"$pod\", pod=~\".+\", value=\"reads_service_bytes\"}[5m]) / (1024 * 1024)) by (pod,device)", + "expr": "-sum(rate(sysom_container_blkio_stat{namespace=~\"$podns\", pod=~\"$pod\", pod=~\".+\", value=\"reads_service_bytes\"}[5m]) / (1024 * 1024)) by (pod,device)", "hide": false, "legendFormat": "{{pod}}-{{device}}-reads", "range": true, @@ -1730,7 +3190,7 @@ "h": 8, "w": 12, "x": 12, - "y": 58 + "y": 96 }, "id": 432, "interval": "60s", @@ -1750,7 +3210,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sum(rate(sysom_container_blkio_stat{namespace=~\"$namespace\", pod=~\"$pod\", value=\"writes_serviced\"}[5m])) by (device, pod)", + "expr": "sum(rate(sysom_container_blkio_stat{namespace=~\"$podns\", pod=~\"$pod\", pod=~\".+\", value=\"writes_serviced\"}[5m])) by (device, pod)", "legendFormat": "{{pod}}-{{device}}-writes", "range": true, "refId": "A" @@ -1758,20 +3218,11 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "-sum(rate(sysom_container_blkio_stat{namespace=~\"$namespace\", pod=~\"$pod\", value=\"reads_serviced\"}[5m])) by (device, pod)", + "expr": "-sum(rate(sysom_cg_blkio_stat{namespace=~\"$podns\", pod=~\"$pod\", pod=~\".+\", value=\"reads_serviced\"}[5m])) by (device, pod)", "hide": false, "legendFormat": "{{pod}}-{{device}}-reads", "range": true, "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "C" } ], "title": "Pod Writes/Reads IOs Rates", @@ -1835,7 +3286,7 @@ "h": 8, "w": 12, "x": 0, - "y": 66 + "y": 104 }, "id": 434, "interval": "60s", @@ -1855,7 +3306,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sum(sysom_container_blkio_stat{namespace=~\"$namespace\", pod=~\"$pod\",value=\"writes_bytes_queued\"}) by (device,pod)", + "expr": "sum(sysom_container_blkio_stat{namespace=~\"$podns\", pod=~\"$pod\", pod=~\".+\",value=\"writes_bytes_queued\"}) by (device,pod)", "legendFormat": "{{pod}}-{{device}}-writes", "range": true, "refId": "A" @@ -1863,7 +3314,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "-sum(sysom_container_blkio_stat{namespace=~\"$namespace\", pod=~\"$pod\", value=\"reads_bytes_queued\"}) by (device,pod)", + "expr": "-sum(sysom_container_blkio_stat{namespace=~\"$podns\", pod=~\"$pod\", pod=~\".+\",value=\"reads_bytes_queued\"} / (1024 * 1024)) by (device,pod)", "hide": false, "legendFormat": "{{pod}}-{{device}}-reads", "range": true, @@ -1932,7 +3383,7 @@ "h": 8, "w": 12, "x": 12, - "y": 66 + "y": 104 }, "id": 436, "interval": "60s", @@ -1952,17 +3403,17 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sum(sysom_container_blkio_stat{namespace=~\"$namespace\", pod=~\"$pod\",value=\"writes_wait_time\"} / 10000000) by (device,pod)", - "legendFormat": "{{pod}}-{{device}}-writes", + "expr": "sum(sysom_container_blkio_stat{namespace=~\"$podns\", pod=~\"$pod\", pod=~\".+\",value=\"writes_wait_time\"} / 1000000) by (device,pod)", + "legendFormat": "{{podname}}-{{device}}-writes", "range": true, "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "-sum(sysom_container_blkio_stat{namespace=~\"$namespace\", pod=~\"$pod\",value=\"reads_wait_time\"} / 1000000) by (device,pod)", + "expr": "-sum(sysom_container_blkio_stat{namespace=~\"$podns\", pod=~\"$pod\", pod=~\".+\",value=\"reads_wait_time\"} / 1000000) by (device,pod)", "hide": false, - "legendFormat": "{{pod}}-{{device}}-reads", + "legendFormat": "{{podname}}-{{device}}-reads", "range": true, "refId": "B" } @@ -1979,33 +3430,6 @@ ], "templating": { "list": [ - { - "current": { - "selected": false, - "text": "192.168.0.123", - "value": "192.168.0.123" - }, - "datasource": "sysom-prometheus", - "definition": "label_values(sysom_proc_meminfo, exported_instance)", - "hide": 2, - "includeAll": false, - "label": "Host:", - "multi": false, - "name": "node", - "options": [], - "query": { - "query": "label_values(sysom_proc_meminfo, exported_instance)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "current": { "selected": false, @@ -2013,15 +3437,15 @@ "value": "$__all" }, "datasource": "sysom-prometheus", - "definition": "label_values(sysom_container_memUtil,pod)", + "definition": "label_values(sysom_container_memory_oomcnt, namespace)", "hide": 0, "includeAll": true, - "label": "pod", + "label": "podns", "multi": true, - "name": "pod", + "name": "podns", "options": [], "query": { - "query": "label_values(sysom_container_memUtil,pod)", + "query": "label_values(sysom_container_memory_oomcnt, namespace)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -2032,24 +3456,20 @@ }, { "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] + "selected": false, + "text": "file-daemonset-fxkd9", + "value": "file-daemonset-fxkd9" }, "datasource": "sysom-prometheus", - "definition": "label_values(sysom_container_memUtil,namespace)", + "definition": "label_values(sysom_container_memory_oomcnt,pod)", "hide": 0, - "includeAll": true, - "label": "namespace", + "includeAll": false, + "label": "pod", "multi": true, - "name": "namespace", + "name": "pod", "options": [], "query": { - "query": "label_values(sysom_container_memUtil,namespace)", + "query": "label_values(sysom_container_memory_oomcnt,pod)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -2090,9 +3510,9 @@ ] }, "timezone": "browser", - "title": "容器监控", + "title": "\u5bb9\u5668\u76d1\u63a7", "uid": "rYdddlPWW", - "version": 23, + "version": 8, "weekStart": "" } } \ No newline at end of file diff --git a/deps/4_grafana/sysom-sysak-base-dashboard.json b/deps/4_grafana/sysom-sysak-base-dashboard.json index 2bae9ba6867bfc101e7b5008807c6129ee1e361e..fb3a0f8d70e9aa67a1074a82e59c537945e9e292 100644 --- a/deps/4_grafana/sysom-sysak-base-dashboard.json +++ b/deps/4_grafana/sysom-sysak-base-dashboard.json @@ -901,9 +901,9 @@ "x": 0, "y": 7 }, - "id": 347, + "id": 405, "panels": [], - "title": "System CPU and Schedule", + "title": "System Health", "type": "row" }, { @@ -912,92 +912,69 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } + "mode": "thresholds" }, - "links": [], "mappings": [], - "unit": "%" + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "#E24D42", + "value": 0 + }, + { + "color": "#EF843C", + "value": 60 + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "dark-green", + "value": 100 + } + ] + } }, "overrides": [] }, "gridPos": { - "h": 9, - "w": 8, + "h": 5, + "w": 5, "x": 0, "y": 8 }, - "id": 345, + "id": 407, "options": { - "displayLabels": [ - "percent", - "name" - ], - "legend": { - "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] - }, - "pieType": "pie", + "orientation": "auto", "reduceOptions": { - "calcs": [], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showThresholdLabels": false, + "showThresholdMarkers": true }, "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"idle\"}", - "legendFormat": "idle", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"total\"}", + "legendFormat": "{{exported_instance}}", "range": true, "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"user\"} + on(instance)sysom_proc_cpu_total{instance=\"$node\",mode=\"nice\"}", - "hide": false, - "legendFormat": "user", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"sys\"} + on(instance)sysom_proc_cpu_total{instance=\"$node\",mode=\"iowait\"} + on(instance)sysom_proc_cpu_total{instance=\"$node\",mode=\"hardirq\"} + on(instance)sysom_proc_cpu_total{instance=\"$node\",mode=\"softirq\"}", - "hide": false, - "legendFormat": "kernel", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"steal\"}", - "hide": false, - "legendFormat": "steal", - "range": true, - "refId": "D" } ], - "title": "CPU Graph", - "type": "piechart" + "title": "Node health", + "type": "gauge" }, { "datasource": "sysom-prometheus", @@ -1005,91 +982,90 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "semi-dark-red", + "value": 0 + }, + { + "color": "orange", + "value": 60 + }, + { + "color": "yellow", + "value": 80 + }, + { + "color": "green", + "value": 100 + } + ] }, - "mappings": [], - "unit": "%" + "unit": "none" }, "overrides": [] }, "gridPos": { - "h": 9, - "w": 8, - "x": 8, + "h": 5, + "w": 5, + "x": 5, "y": 8 }, - "id": 348, + "id": 409, + "links": [], + "maxDataPoints": 100, "options": { - "displayLabels": [ - "name", - "value" - ], - "legend": { - "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] - }, - "pieType": "pie", + "orientation": "horizontal", "reduceOptions": { - "calcs": [], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showThresholdLabels": false, + "showThresholdMarkers": false }, "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"sys\"}", - "legendFormat": "sys", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"iowait\"}", - "hide": false, - "legendFormat": "iowait", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"hardirq\"}", - "hide": false, - "legendFormat": "hardirq", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"softirq\"}", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"error\"}", + "format": "time_series", "hide": false, - "legendFormat": "softirq", - "range": true, - "refId": "E" + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 } ], - "title": "Kernel Used CPU", - "type": "piechart" + "title": "Errors Health", + "type": "gauge" }, { "datasource": "sysom-prometheus", @@ -1097,41 +1073,62 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "green", + "value": 100 + } + ] }, - "mappings": [], "unit": "none" }, "overrides": [] }, "gridPos": { - "h": 9, - "w": 8, - "x": 16, + "h": 5, + "w": 4, + "x": 10, "y": 8 }, - "id": 356, + "id": 411, + "links": [], + "maxDataPoints": 100, "options": { - "displayLabels": [ - "value", - "percent" - ], - "legend": { - "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] - }, - "pieType": "pie", + "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" @@ -1139,48 +1136,27 @@ "fields": "", "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showThresholdLabels": false, + "showThresholdMarkers": false }, "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sysom_cgroups{instance=\"$node\",type=\"num_cgroups\"}", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"latency\"}", "format": "time_series", - "instant": true, + "hide": false, + "instant": false, "interval": "", - "legendFormat": "{{value}}", - "range": false, - "refId": "A" - } - ], - "title": "Cgroup Numbers", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "__name__": true, - "exported_instance": true, - "instance": true, - "job": true, - "type": true - }, - "indexByName": {}, - "renameByName": { - "Value": "Count", - "value": "Type" - } - } + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 } ], - "type": "piechart" + "title": "Latency Health", + "type": "gauge" }, { "datasource": "sysom-prometheus", @@ -1188,203 +1164,90 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 4, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" } - }, - "mappings": [], + ], + "max": 100, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null }, { "color": "red", - "value": 80 - } - ] - }, - "unit": "%" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "app" - }, - "properties": [ + "value": 0 + }, { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "kernel" - }, - "properties": [ + "color": "orange", + "value": 60 + }, { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "free" - }, - "properties": [ + "color": "#EAB839", + "value": 80 + }, { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "color": "green", + "value": 100 } ] - } - ] + }, + "unit": "none" + }, + "overrides": [] }, "gridPos": { - "h": 11, - "w": 12, - "x": 0, - "y": 17 + "h": 5, + "w": 5, + "x": 14, + "y": 8 }, - "id": 352, - "options": { - "legend": { + "id": 413, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { "calcs": [ - "min", - "mean", - "max", "lastNotNull" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showThresholdLabels": false, + "showThresholdMarkers": false }, "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"user\"}", - "hide": false, - "legendFormat": "user", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"sys\"}", - "hide": false, - "legendFormat": "sys", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"nice\"}", - "hide": false, - "legendFormat": "nice", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"idle\"}", - "legendFormat": "idle", - "range": true, - "refId": "D" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"iowait\"}", - "hide": false, - "legendFormat": "iowait", - "range": true, - "refId": "E" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"hardirq\"}", - "hide": false, - "legendFormat": "hardirq", - "range": true, - "refId": "F" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"softirq\"}", - "hide": false, - "legendFormat": "softirq", - "range": true, - "refId": "G" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"steal\"}", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"capacity\"}", + "format": "time_series", "hide": false, - "legendFormat": "steal", - "range": true, - "refId": "H" + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 } ], - "title": "CPU Used", - "type": "timeseries" + "title": "Saturation Health", + "type": "gauge" }, { "datasource": "sysom-prometheus", @@ -1392,143 +1255,90 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 4, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" } - }, - "mappings": [], + ], + "max": 100, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "blue", "value": null }, { "color": "red", - "value": 80 - } - ] - }, - "unit": "%" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "app" - }, - "properties": [ + "value": 0 + }, { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "kernel" - }, - "properties": [ + "color": "orange", + "value": 60 + }, { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "free" - }, - "properties": [ + "color": "#EAB839", + "value": 80 + }, { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "color": "green", + "value": 100 } ] - } - ] + }, + "unit": "none" + }, + "overrides": [] }, "gridPos": { - "h": 11, - "w": 12, - "x": 12, - "y": 17 + "h": 5, + "w": 5, + "x": 19, + "y": 8 }, - "id": 398, + "id": 415, + "links": [], + "maxDataPoints": 100, "options": { - "legend": { + "orientation": "horizontal", + "reduceOptions": { "calcs": [ - "min", - "mean", - "max", "lastNotNull" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showThresholdLabels": false, + "showThresholdMarkers": false }, "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "100 - sysom_proc_cpus{instance=\"$node\",mode=\"idle\"}", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"load\"}", + "format": "time_series", "hide": false, "instant": false, - "legendFormat": "{{cpu_name}} busy", - "range": true, - "refId": "A" + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 } ], - "title": "CPU Used", - "type": "timeseries" + "title": "Load(Traffic) Health", + "type": "gauge" }, { "datasource": "sysom-prometheus", @@ -1536,1292 +1346,1197 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "text", + "value": null }, { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "app" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "kernel" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "free" - }, - "properties": [ + "color": "#E24D42", + "value": 0 + }, { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "sys" - }, - "properties": [ + "color": "#EF843C", + "value": 60 + }, { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "hardirq" - }, - "properties": [ + "color": "#EAB839", + "value": 80 + }, { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "color": "dark-green", + "value": 100 } ] } - ] + }, + "overrides": [] }, "gridPos": { - "h": 11, - "w": 12, + "h": 5, + "w": 24, "x": 0, - "y": 28 + "y": 13 }, - "id": 351, + "id": 417, + "interval": "30s", "options": { - "legend": { + "orientation": "auto", + "reduceOptions": { "calcs": [ - "min", - "mean", - "max", "lastNotNull" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showThresholdLabels": false, + "showThresholdMarkers": true }, "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"load1\"}", - "legendFormat": "load 1m", + "expr": "sysom_pod_health_score{exported_instance=~\"$node\", type=\"total\"}", + "legendFormat": "{{pod}}", "range": true, "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"load5\"}", - "hide": false, - "legendFormat": "load 5m", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"load15\"}", - "hide": false, - "legendFormat": "load 15m", - "range": true, - "refId": "C" } ], - "title": "System Load", - "type": "timeseries" + "title": "Pod/Container Health", + "type": "gauge" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 4, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "align": "auto", + "displayMode": "auto", + "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "dark-red", + "value": null }, { "color": "red", - "value": 80 + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] - }, - "unit": "short" + } }, "overrides": [ { "matcher": { "id": "byName", - "options": "app" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", "value": { - "fixedColor": "blue", - "mode": "fixed" + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] } + }, + { + "id": "custom.width", + "value": 144 } ] }, { "matcher": { "id": "byName", - "options": "kernel" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } + "id": "custom.width", + "value": 147 } ] }, { "matcher": { "id": "byName", - "options": "free" + "options": "node" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "id": "custom.width", + "value": 175 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 } ] } ] }, "gridPos": { - "h": 11, - "w": 12, - "x": 12, - "y": 28 + "h": 8, + "w": 6, + "x": 0, + "y": 18 }, - "id": 372, + "id": 427, + "interval": "30s", "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" + "footer": { + "fields": "", + "reducer": [ + "sum" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "show": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "expr": "sum by(value)(rate(sysom_interrupts{instance=\"$node\"}[$__rate_interval]))", - "hide": false, - "legendFormat": "{{value}}", - "range": true, - "refId": "A" + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE instance = '$node' AND metric_type = \"capacity\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Saturation Health", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "instance": false, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "namespace": 3, + "pod": 2, + "score": 4, + "value": 5 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" + } + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], - "title": "Hardirqs Rate", - "type": "timeseries" + "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 4, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "align": "auto", + "displayMode": "auto", + "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "dark-red", + "value": null }, { "color": "red", - "value": 80 + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] - }, - "unit": "short" + } }, "overrides": [ { "matcher": { "id": "byName", - "options": "app" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", "value": { - "fixedColor": "blue", - "mode": "fixed" + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] } + }, + { + "id": "custom.width", + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "kernel" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } + "id": "custom.width", + "value": 150 } ] }, { "matcher": { "id": "byName", - "options": "free" + "options": "node" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "id": "custom.width", + "value": 155 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 } ] } ] }, "gridPos": { - "h": 11, - "w": 12, - "x": 0, - "y": 39 + "h": 8, + "w": 6, + "x": 6, + "y": 18 }, - "id": 350, + "id": 428, + "interval": "30s", "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" + "footer": { + "fields": "", + "reducer": [ + "sum" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "show": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "expr": "rate(sysom_proc_sirq{instance=\"$node\"}[$__rate_interval])", - "hide": false, - "legendFormat": "{{type}}", - "range": true, - "refId": "A" + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE instance = '$node' AND metric_type = \"load\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Load Health", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "instance": false, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "namespace": 3, + "pod": 2, + "score": 4, + "value": 5 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" + } + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], - "title": "Softirqs Rate", - "type": "timeseries" + "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "log": 10, - "type": "log" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "align": "auto", + "displayMode": "auto", + "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "dark-red", + "value": null }, { "color": "red", - "value": 80 + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] - }, - "unit": "none" + } }, "overrides": [ { "matcher": { "id": "byName", - "options": "app" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "kernel" - }, - "properties": [ + "id": "custom.displayMode", + "value": "lcd-gauge" + }, { - "id": "color", + "id": "color" + }, + { + "id": "thresholds", "value": { - "fixedColor": "yellow", - "mode": "fixed" + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] } + }, + { + "id": "custom.width", + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "free" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "id": "custom.width", + "value": 150 } ] }, { "matcher": { "id": "byName", - "options": "sys" + "options": "node" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } + "id": "custom.width", + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "hardirq" + "options": "pod" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "id": "custom.width", + "value": 135 } ] } ] }, "gridPos": { - "h": 11, - "w": 12, + "h": 8, + "w": 6, "x": 12, - "y": 39 + "y": 18 }, - "id": 353, + "id": 440, + "interval": "30s", "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" + "footer": { + "fields": "", + "reducer": [ + "sum" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "show": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_stat_counters{instance=\"$node\",counter=\"ctxt\"} / 20", - "legendFormat": "Context Switches", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "expr": "sysom_proc_stat_counters{instance=\"$node\",counter=\"processes_forks\"} / 20", - "hide": false, - "legendFormat": "Forks", - "range": true, - "refId": "B" - }, + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE instance = '$node' AND metric_type = \"latency\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Latency Health", + "transformations": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"runq\"}", - "hide": false, - "legendFormat": "Running Threads", - "range": true, - "refId": "C" + "id": "organize", + "options": { + "excludeByName": { + "instance": false, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "namespace": 3, + "pod": 2, + "score": 4, + "value": 5 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" + } + } }, { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"plit\"}", - "hide": false, - "legendFormat": "Total Threads", - "range": true, - "refId": "D" + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], - "title": "Context Switches / Forks", - "type": "timeseries" + "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "cgroup子系统周期内增长速率", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 4, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "align": "auto", + "displayMode": "auto", + "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green" + "color": "dark-red", + "value": null }, { "color": "red", - "value": 80 + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] - }, - "unit": "short" + } }, "overrides": [ { "matcher": { "id": "byName", - "options": "app" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", + "id": "custom.displayMode", + "value": "lcd-gauge" + }, + { + "id": "color" + }, + { + "id": "thresholds", "value": { - "fixedColor": "blue", - "mode": "fixed" + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 + } + ] } + }, + { + "id": "custom.width", + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "kernel" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } + "id": "custom.width", + "value": 150 } ] }, { "matcher": { "id": "byName", - "options": "free" + "options": "node" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "id": "custom.width", + "value": 155 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pod" + }, + "properties": [ + { + "id": "custom.width", + "value": 135 } ] } ] }, "gridPos": { - "h": 11, - "w": 12, - "x": 0, - "y": 50 + "h": 8, + "w": 6, + "x": 18, + "y": 18 }, - "id": 373, + "id": 441, + "interval": "30s", "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" + "footer": { + "fields": "", + "reducer": [ + "sum" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "show": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "expr": "delta(sysom_cgroups{type=\"num_cgroups\",instance=\"$node\"}[$__range])", - "hide": false, - "legendFormat": "{{value}}", - "range": true, - "refId": "A" - } - ], - "title": "Cgroup Numbers Increase Rate", - "type": "timeseries" - }, - { - "datasource": "sysom-prometheus", - "description": "关中断过长统计", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 4, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "app" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "kernel" - }, - "properties": [ + "format": "table", + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE instance = '$node' AND metric_type = \"error\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } + "parameters": [], + "type": "function" } - ] - }, - { - "matcher": { - "id": "byName", - "options": "free" - }, - "properties": [ + ], + "groupBy": [ { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } + "property": { + "type": "string" + }, + "type": "groupBy" } - ] + ], + "limit": 50 } - ] - }, - "gridPos": { - "h": 11, - "w": 12, - "x": 12, - "y": 50 - }, - "id": 396, - "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" } - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt50ms\",mod=\"irqoff\"}", - "legendFormat": "DelayTime in (50ms,100ms)", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt100ms\",mod=\"irqoff\"}", - "hide": false, - "legendFormat": "DelayTime in [100ms, 500ms)", - "range": true, - "refId": "B" - }, + ], + "title": "Errors Health", + "transformations": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt500ms\",mod=\"irqoff\"}", - "hide": false, - "legendFormat": "DelayTime in [500ms, 1s)", - "range": true, - "refId": "C" + "id": "organize", + "options": { + "excludeByName": { + "instance": false, + "namespace": true, + "value": false + }, + "indexByName": { + "instance": 1, + "metric_id": 0, + "namespace": 3, + "pod": 2, + "score": 4, + "value": 5 + }, + "renameByName": { + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" + } + } }, { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt1s\",mod=\"irqoff\"}", - "hide": false, - "legendFormat": "DelayTime >= 1s", - "range": true, - "refId": "D" + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], - "title": "IrqOff Count", - "type": "timeseries" + "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "发生长时间不调度的次数", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 4, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 347, + "panels": [ + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "app" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false } - } - ] + }, + "links": [], + "mappings": [], + "unit": "%" + }, + "overrides": [] }, - { - "matcher": { - "id": "byName", - "options": "kernel" + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 3 + }, + "id": 345, + "options": { + "displayLabels": [ + "percent", + "name" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } - } - ] + "pieType": "pie", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } }, - { - "matcher": { - "id": "byName", - "options": "free" + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"idle\"}", + "legendFormat": "idle", + "range": true, + "refId": "A" }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 11, - "w": 12, - "x": 0, - "y": 61 - }, - "id": 354, - "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"user\"} + on(instance)sysom_proc_cpu_total{instance=\"$node\",mode=\"nice\"}", + "hide": false, + "legendFormat": "user", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"sys\"} + on(instance)sysom_proc_cpu_total{instance=\"$node\",mode=\"iowait\"} + on(instance)sysom_proc_cpu_total{instance=\"$node\",mode=\"hardirq\"} + on(instance)sysom_proc_cpu_total{instance=\"$node\",mode=\"softirq\"}", + "hide": false, + "legendFormat": "kernel", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"steal\"}", + "hide": false, + "legendFormat": "steal", + "range": true, + "refId": "D" + } ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt50ms\",mod=\"noschd\"}", - "legendFormat": "DelayTime in (50ms,100ms)", - "range": true, - "refId": "A" + "title": "CPU Graph", + "type": "piechart" }, { "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt100ms\",mod=\"noschd\"}", - "hide": false, - "legendFormat": "DelayTime in [100ms, 500ms)", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt500ms\",mod=\"noschd\"}", - "hide": false, - "legendFormat": "DelayTime in [500ms, 1s)", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt1s\",mod=\"noschd\"}", - "hide": false, - "legendFormat": "DelayTime >= 1s", - "range": true, - "refId": "D" - } - ], - "title": "NoSched Count", - "type": "timeseries" - }, - { - "datasource": "sysom-prometheus", - "description": "在就绪队列中等待的时间", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMax": 4, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "color": "red", - "value": 80 - } - ] + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "%" + }, + "overrides": [] }, - "unit": "ns" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "app" + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 3 + }, + "id": 348, + "options": { + "displayLabels": [ + "name", + "value" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] + "pieType": "pie", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } }, - { - "matcher": { - "id": "byName", - "options": "kernel" + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"sys\"}", + "legendFormat": "sys", + "range": true, + "refId": "A" }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"iowait\"}", + "hide": false, + "legendFormat": "iowait", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"hardirq\"}", + "hide": false, + "legendFormat": "hardirq", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"softirq\"}", + "hide": false, + "legendFormat": "softirq", + "range": true, + "refId": "E" + } + ], + "title": "Kernel Used CPU", + "type": "piechart" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false } - } - ] + }, + "mappings": [], + "unit": "none" + }, + "overrides": [] }, - { - "matcher": { - "id": "byName", - "options": "free" + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 3 + }, + "id": 356, + "options": { + "displayLabels": [ + "value", + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_cgroups{instance=\"$node\",type=\"num_cgroups\"}", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "{{value}}", + "range": false, + "refId": "A" + } + ], + "title": "Cgroup Numbers", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "exported_instance": true, + "instance": true, + "job": true, + "type": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Count", + "value": "Type" } } - ] - } - ] - }, - "gridPos": { - "h": 11, - "w": 12, - "x": 12, - "y": 61 - }, - "id": 355, - "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" + } ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "type": "piechart" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_schedstat{instance=\"$node\",value=\"delay\"}[$__rate_interval])", - "legendFormat": "{{cpu}} WakeUp2Sched Delay", - "range": true, - "refId": "A" - } - ], - "title": "WaitOnRunq Delay", - "type": "timeseries" - }, - { - "collapsed": true, - "datasource": "sysom-prometheus", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 72 - }, - "id": 266, - "panels": [ { "datasource": "sysom-prometheus", "description": "", @@ -2830,29 +2545,2419 @@ "color": { "mode": "palette-classic" }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 4, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "%" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 352, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"user\"}", + "hide": false, + "legendFormat": "user", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"sys\"}", + "hide": false, + "legendFormat": "sys", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"nice\"}", + "hide": false, + "legendFormat": "nice", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"idle\"}", + "legendFormat": "idle", + "range": true, + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"iowait\"}", + "hide": false, + "legendFormat": "iowait", + "range": true, + "refId": "E" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"hardirq\"}", + "hide": false, + "legendFormat": "hardirq", + "range": true, + "refId": "F" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"softirq\"}", + "hide": false, + "legendFormat": "softirq", + "range": true, + "refId": "G" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_cpu_total{instance=\"$node\",mode=\"steal\"}", + "hide": false, + "legendFormat": "steal", + "range": true, + "refId": "H" + } + ], + "title": "CPU Used", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 4, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "%" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 398, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "100 - sysom_proc_cpus{instance=\"$node\",mode=\"idle\"}", + "hide": false, + "instant": false, + "legendFormat": "{{cpu_name}} busy", + "range": true, + "refId": "A" + } + ], + "title": "CPU Used", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "sys" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "hardirq" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 351, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"load1\"}", + "legendFormat": "load 1m", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"load5\"}", + "hide": false, + "legendFormat": "load 5m", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"load15\"}", + "hide": false, + "legendFormat": "load 15m", + "range": true, + "refId": "C" + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 4, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 372, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sum by(value)(rate(sysom_interrupts{instance=\"$node\"}[$__rate_interval]))", + "hide": false, + "legendFormat": "{{value}}", + "range": true, + "refId": "A" + } + ], + "title": "Hardirqs Rate", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 4, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 350, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "rate(sysom_proc_sirq{instance=\"$node\"}[$__rate_interval])", + "hide": false, + "legendFormat": "{{type}}", + "range": true, + "refId": "A" + } + ], + "title": "Softirqs Rate", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "sys" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "hardirq" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 353, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_stat_counters{instance=\"$node\",counter=\"ctxt\"} / 20", + "legendFormat": "Context Switches", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_stat_counters{instance=\"$node\",counter=\"processes_forks\"} / 20", + "hide": false, + "legendFormat": "Forks", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"runq\"}", + "hide": false, + "legendFormat": "Running Threads", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_loadavg{instance=\"$node\",value=\"plit\"}", + "hide": false, + "legendFormat": "Total Threads", + "range": true, + "refId": "D" + } + ], + "title": "Context Switches / Forks", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "cgroup\u5b50\u7cfb\u7edf\u5468\u671f\u5185\u589e\u957f\u901f\u7387", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 4, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 45 + }, + "id": 373, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "delta(sysom_cgroups{type=\"num_cgroups\",instance=\"$node\"}[$__range])", + "hide": false, + "legendFormat": "{{value}}", + "range": true, + "refId": "A" + } + ], + "title": "Cgroup Numbers Increase Rate", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u5173\u4e2d\u65ad\u8fc7\u957f\u7edf\u8ba1", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 4, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 45 + }, + "id": 396, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt50ms\",mod=\"irqoff\"}", + "legendFormat": "DelayTime in (50ms,100ms)", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt100ms\",mod=\"irqoff\"}", + "hide": false, + "legendFormat": "DelayTime in [100ms, 500ms)", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt500ms\",mod=\"irqoff\"}", + "hide": false, + "legendFormat": "DelayTime in [500ms, 1s)", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt1s\",mod=\"irqoff\"}", + "hide": false, + "legendFormat": "DelayTime >= 1s", + "range": true, + "refId": "D" + } + ], + "title": "IrqOff Count", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u53d1\u751f\u957f\u65f6\u95f4\u4e0d\u8c03\u5ea6\u7684\u6b21\u6570", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 4, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 354, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt50ms\",mod=\"noschd\"}", + "legendFormat": "DelayTime in (50ms,100ms)", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt100ms\",mod=\"noschd\"}", + "hide": false, + "legendFormat": "DelayTime in [100ms, 500ms)", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt500ms\",mod=\"noschd\"}", + "hide": false, + "legendFormat": "DelayTime in [500ms, 1s)", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sched_moni_jitter{instance=\"$node\",value=\"gt1s\",mod=\"noschd\"}", + "hide": false, + "legendFormat": "DelayTime >= 1s", + "range": true, + "refId": "D" + } + ], + "title": "NoSched Count", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u5728\u5c31\u7eea\u961f\u5217\u4e2d\u7b49\u5f85\u7684\u65f6\u95f4", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 4, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ns" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 355, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "rate(sysom_proc_schedstat{instance=\"$node\",value=\"delay\"}[$__rate_interval])", + "legendFormat": "{{cpu}} WakeUp2Sched Delay", + "range": true, + "refId": "A" + } + ], + "title": "WaitOnRunq Delay", + "type": "timeseries" + } + ], + "title": "System CPU and Schedule", + "type": "row" + }, + { + "collapsed": true, + "datasource": "sysom-prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 266, + "panels": [ + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "decimals": 2, + "mappings": [], + "min": -4, + "unit": "kbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel reserved" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 3 + }, + "id": 333, + "options": { + "displayLabels": [ + "percent", + "value" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"kernel_reserved\"}", + "legendFormat": "system reserved", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"MemFree\"}", + "hide": false, + "legendFormat": "free", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_used\"}", + "hide": false, + "legendFormat": "app use", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"kernel_used\"} - on(instance)sysom_proc_meminfo{value=\"kernel_reserved\"}", + "hide": false, + "legendFormat": "kernel use", + "range": true, + "refId": "D" + } + ], + "title": "Memory Graph", + "type": "piechart" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "kbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 3 + }, + "id": 334, + "options": { + "displayLabels": [ + "percent", + "value" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"VmallocUsed\"}", + "hide": false, + "legendFormat": "VmallocUsed", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"SReclaimable\"}", + "hide": false, + "legendFormat": "SReclaimable", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"SUnreclaim\"}", + "hide": false, + "legendFormat": "SUnreclaim", + "range": true, + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"PageTables\"}", + "hide": false, + "legendFormat": "PageTables", + "range": true, + "refId": "E" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"alloc_page\"}", + "hide": false, + "legendFormat": "alloc_page", + "range": true, + "refId": "F" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"KernelStack\"}", + "hide": false, + "legendFormat": "KernelStack", + "range": true, + "refId": "G" + } + ], + "title": "Kernel Used Memory", + "type": "piechart" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "kbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 3 + }, + "id": 335, + "options": { + "displayLabels": [ + "percent", + "value" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_filecache\"}", + "legendFormat": "filecache", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_anon\"}", + "hide": false, + "legendFormat": "anon", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_mlock\"}", + "hide": false, + "legendFormat": "mlock", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_buffers\"}", + "hide": false, + "legendFormat": "buffers", + "range": true, + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_shmem\"}", + "hide": false, + "legendFormat": "shmem", + "range": true, + "refId": "E" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_huge_1G\"}", + "hide": false, + "legendFormat": "huge_1G", + "range": true, + "refId": "F" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_huge_2M\"}", + "hide": false, + "legendFormat": "huge_2M", + "range": true, + "refId": "G" + } + ], + "title": "User Used Memory", + "type": "piechart" + }, + { + "datasource": "sysom-prometheus", + "description": "\u5185\u5b58\u4f7f\u7528\u7387", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "kbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 331, + "links": [], + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "#node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}\nsysom_proc_meminfo{value=\"total\",instance=\"$node\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "#node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}\nsysom_proc_meminfo{value=\"total\",instance=\"$node\"} - on(instance)sysom_proc_meminfo{value=\"MemFree\",instance=\"$node\"} - on(instance)sysom_proc_meminfo{value=\"Cached\",instance=\"$node\"} - on(instance)sysom_proc_meminfo{value=\"Buffers\",instance=\"$node\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Used", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{value=\"MemFree\",instance=\"$node\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Free", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{value=\"Cached\",instance=\"$node\"} + on(instance)sysom_proc_meminfo{value=\"Buffers\",instance=\"$node\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Buffers+Cached", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u5185\u5b58\u4f19\u4f34\u7cfb\u7edf\u60c5\u51b5", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] }, - "decimals": 2, - "mappings": [], - "min": -4, - "unit": "kbytes" - }, - "overrides": [ { "matcher": { "id": "byName", - "options": "app" + "options": "Swap" }, "properties": [ { "id": "color", "value": { - "fixedColor": "blue", + "fixedColor": "#BF1B00", "mode": "fixed" } } @@ -2861,13 +4966,13 @@ { "matcher": { "id": "byName", - "options": "kernel" + "options": "Swap_Cache" }, "properties": [ { "id": "color", "value": { - "fixedColor": "yellow", + "fixedColor": "#C15C17", "mode": "fixed" } } @@ -2876,13 +4981,13 @@ { "matcher": { "id": "byName", - "options": "free" + "options": "Swap_Free" }, "properties": [ { "id": "color", "value": { - "fixedColor": "green", + "fixedColor": "#2F575E", "mode": "fixed" } } @@ -2891,204 +4996,200 @@ { "matcher": { "id": "byName", - "options": "kernel reserved" + "options": "Unused" }, "properties": [ { "id": "color", "value": { - "fixedColor": "red", + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*CommitLimit - *./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", "mode": "fixed" } + }, + { + "id": "custom.fillOpacity", + "value": 0 } ] } ] }, "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 3 + "h": 10, + "w": 12, + "x": 12, + "y": 11 }, - "id": 333, + "id": 135, + "links": [], "options": { - "displayLabels": [ - "percent", - "value" - ], "legend": { - "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] - }, - "pieType": "pie", - "reduceOptions": { "calcs": [ + "min", + "mean", + "max", "lastNotNull" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "none" } }, - "pluginVersion": "9.2.2", + "pluginVersion": "9.2.0", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"kernel_reserved\"}", - "legendFormat": "system reserved", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo0\",instance=\"$node\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "4K", "range": true, - "refId": "A" + "refId": "A", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"MemFree\"}", - "hide": false, - "legendFormat": "free", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo1\",instance=\"$node\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "8K", "range": true, - "refId": "B" + "refId": "B", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_used\"}", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo2\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "app use", + "intervalFactor": 1, + "legendFormat": "16K", "range": true, - "refId": "C" + "refId": "D", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"kernel_used\"} - on(instance)sysom_proc_meminfo{value=\"kernel_reserved\"}", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo3\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "kernel use", + "intervalFactor": 1, + "legendFormat": "32K", "range": true, - "refId": "D" - } - ], - "title": "Memory Graph", - "type": "piechart" - }, - { - "datasource": "sysom-prometheus", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [], - "unit": "kbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 3 - }, - "id": 334, - "options": { - "displayLabels": [ - "percent", - "value" - ], - "legend": { - "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] + "refId": "E", + "step": 240 }, - "pieType": "pie", - "reduceOptions": { - "calcs": [], - "fields": "", - "values": false + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo4\",instance=\"$node\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "64K", + "range": true, + "refId": "F", + "step": 240 }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.2.2", - "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"VmallocUsed\"}", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo5\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "VmallocUsed", + "intervalFactor": 1, + "legendFormat": "128K", "range": true, - "refId": "B" + "refId": "G", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"SReclaimable\"}", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo6\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "SReclaimable", + "intervalFactor": 1, + "legendFormat": "256K", "range": true, - "refId": "C" + "refId": "H", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"SUnreclaim\"}", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo7\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "SUnreclaim", + "intervalFactor": 1, + "legendFormat": "512K", "range": true, - "refId": "D" + "refId": "I", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"PageTables\"}", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo8\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "PageTables", + "intervalFactor": 1, + "legendFormat": "1M", "range": true, - "refId": "E" + "refId": "J", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"alloc_page\"}", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo9\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "alloc_page", + "intervalFactor": 1, + "legendFormat": "2M", "range": true, - "refId": "F" + "refId": "K", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"KernelStack\"}", + "expr": "sysom_proc_buddyinfo{value=\"buddyinfo10\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "KernelStack", + "intervalFactor": 1, + "legendFormat": "4M", "range": true, - "refId": "G" + "refId": "L", + "step": 240 } ], - "title": "Kernel Used Memory", - "type": "piechart" + "title": "Memory BuddyInfo", + "type": "timeseries" }, { "datasource": "sysom-prometheus", @@ -3099,42 +5200,66 @@ "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" } }, "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, "unit": "kbytes" }, "overrides": [] }, "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 3 + "h": 10, + "w": 12, + "x": 0, + "y": 21 }, - "id": 335, + "id": 336, "options": { - "displayLabels": [ - "percent", - "value" - ], "legend": { + "calcs": [ + "min", + "mean", + "max", + "lastNotNull" + ], "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [], - "fields": "", - "values": false + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -3146,72 +5271,73 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_filecache\"}", - "legendFormat": "filecache", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"kernel_reserved\"}", + "hide": false, + "legendFormat": "reserved", "range": true, "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_anon\"}", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"VmallocUsed\"}", "hide": false, - "legendFormat": "anon", + "legendFormat": "VmallocUsed", "range": true, "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_mlock\"}", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"SReclaimable\"}", "hide": false, - "legendFormat": "mlock", + "legendFormat": "SReclaimable", "range": true, "refId": "C" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_buffers\"}", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"SUnreclaim\"}", "hide": false, - "legendFormat": "buffers", + "legendFormat": "SUnreclaim", "range": true, "refId": "D" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_shmem\"}", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"PageTables\"}", "hide": false, - "legendFormat": "shmem", + "legendFormat": "PageTables", "range": true, "refId": "E" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_huge_1G\"}", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"alloc_page\"}", "hide": false, - "legendFormat": "huge_1G", + "legendFormat": "alloc_page", "range": true, "refId": "F" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_huge_2M\"}", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"KernelStack\"}", "hide": false, - "legendFormat": "huge_2M", + "legendFormat": "KernelStack", "range": true, "refId": "G" } ], - "title": "User Used Memory", - "type": "piechart" + "title": "Kernel Used Memory", + "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "内存使用率", + "description": "", "fieldConfig": { "defaults": { "color": { @@ -3221,7 +5347,7 @@ "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", - "axisPlacement": "left", + "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, @@ -3237,7 +5363,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -3247,9 +5373,7 @@ "mode": "off" } }, - "links": [], "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -3260,16 +5384,61 @@ }, "unit": "kbytes" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "app" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "kernel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 10, "w": 12, - "x": 0, - "y": 11 + "x": 12, + "y": 21 }, - "id": 331, - "links": [], + "id": 337, "options": { "legend": { "calcs": [ @@ -3280,69 +5449,84 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 350 + "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, - "pluginVersion": "9.2.0", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "#node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}\nsysom_proc_meminfo{value=\"total\",instance=\"$node\"}\n", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Total", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_filecache\"}", + "legendFormat": "filecache", "range": true, - "refId": "A", - "step": 240 + "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "#node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}\nsysom_proc_meminfo{value=\"total\",instance=\"$node\"} - on(instance)sysom_proc_meminfo{value=\"MemFree\",instance=\"$node\"} - on(instance)sysom_proc_meminfo{value=\"Cached\",instance=\"$node\"} - on(instance)sysom_proc_meminfo{value=\"Buffers\",instance=\"$node\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Used", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_anon\"}", + "hide": false, + "legendFormat": "anon", "range": true, - "refId": "B", - "step": 240 + "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{value=\"MemFree\",instance=\"$node\"}", - "format": "time_series", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_mlock\"}", "hide": false, - "intervalFactor": 1, - "legendFormat": "Free", + "legendFormat": "mlock", "range": true, - "refId": "C", - "step": 240 + "refId": "C" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{value=\"Cached\",instance=\"$node\"} + on(instance)sysom_proc_meminfo{value=\"Buffers\",instance=\"$node\"}", - "format": "time_series", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_buffers\"}", "hide": false, - "intervalFactor": 1, - "legendFormat": "Buffers+Cached", + "legendFormat": "buffers", "range": true, - "refId": "D", - "step": 240 + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_shmem\"}", + "hide": false, + "legendFormat": "shmem", + "range": true, + "refId": "E" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_huge_1G\"}", + "hide": false, + "legendFormat": "huge_1G", + "range": true, + "refId": "F" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_huge_2M\"}", + "hide": false, + "legendFormat": "huge_2M", + "range": true, + "refId": "G" } ], - "title": "Memory Usage", + "title": "User Used Memory", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "内存伙伴系统情况", + "description": "THP\u7533\u8bf7\u9891\u7387", "fieldConfig": { "defaults": { "color": { @@ -3646,27 +5830,8 @@ "id": "color", "value": { "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*CommitLimit - *./" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 + "mode": "fixed" + } } ] } @@ -3675,10 +5840,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 12, - "y": 11 + "x": 0, + "y": 31 }, - "id": 135, + "id": 191, "links": [], "options": { "legend": { @@ -3703,140 +5868,70 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo0\",instance=\"$node\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "4K", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo1\",instance=\"$node\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "8K", - "range": true, - "refId": "B", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo2\",instance=\"$node\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "16K", - "range": true, - "refId": "D", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo3\",instance=\"$node\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "32K", - "range": true, - "refId": "E", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo4\",instance=\"$node\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "64K", - "range": true, - "refId": "F", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo5\",instance=\"$node\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "128K", - "range": true, - "refId": "G", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo6\",instance=\"$node\"}", + "expr": "rate(sysom_proc_vmstat{value=\"thp_fault_alloc\",instance=\"$node\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "256K", + "legendFormat": "THP Fault Alloc Times", "range": true, - "refId": "H", + "refId": "A", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo7\",instance=\"$node\"}", + "expr": "rate(sysom_proc_vmstat{value=\"thp_fault_fallback\",instance=\"$node\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "512K", + "legendFormat": "THP Fault Fallback", "range": true, - "refId": "I", + "refId": "B", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo8\",instance=\"$node\"}", + "expr": "rate(sysom_proc_vmstat{value=\"thp_collapse_alloc\",instance=\"$node\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "1M", + "legendFormat": "THP Collapse Alloc", "range": true, - "refId": "J", + "refId": "C", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo9\",instance=\"$node\"}", + "expr": "rate(sysom_proc_vmstat{value=\"thp_collapse_alloc_failed\",instance=\"$node\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "2M", + "legendFormat": "THP Collapse Alloc Fail", "range": true, - "refId": "K", + "refId": "D", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_buddyinfo{value=\"buddyinfo10\",instance=\"$node\"}", + "expr": "rate(sysom_proc_meminfo{value=\"AnonHugePages\",instance=\"$node\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "4M", + "legendFormat": "THP Page Count", "range": true, - "refId": "L", + "refId": "E", "step": 240 } ], - "title": "Memory BuddyInfo", + "title": "THP Alloc Rate", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "", + "description": "\u53ef\u56de\u6536\u548c\u4e0d\u53ef\u56de\u6536Slab\u5185\u5b58\u5927\u5c0f", "fieldConfig": { "defaults": { "color": { @@ -3862,151 +5957,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - } - ] - }, - "unit": "kbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 21 - }, - "id": 336, - "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"kernel_reserved\"}", - "hide": false, - "legendFormat": "reserved", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"VmallocUsed\"}", - "hide": false, - "legendFormat": "VmallocUsed", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"SReclaimable\"}", - "hide": false, - "legendFormat": "SReclaimable", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"SUnreclaim\"}", - "hide": false, - "legendFormat": "SUnreclaim", - "range": true, - "refId": "D" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"PageTables\"}", - "hide": false, - "legendFormat": "PageTables", - "range": true, - "refId": "E" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"alloc_page\"}", - "hide": false, - "legendFormat": "alloc_page", - "range": true, - "refId": "F" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"KernelStack\"}", - "hide": false, - "legendFormat": "KernelStack", - "range": true, - "refId": "G" - } - ], - "title": "Kernel Used Memory", - "type": "timeseries" - }, - { - "datasource": "sysom-prometheus", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -4016,7 +5967,9 @@ "mode": "off" } }, + "links": [], "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -4027,61 +5980,16 @@ }, "unit": "kbytes" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "app" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "kernel" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "yellow", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 10, "w": 12, "x": 12, - "y": 21 + "y": 31 }, - "id": 337, + "id": 136, + "links": [], "options": { "legend": { "calcs": [ @@ -4092,84 +6000,46 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true + "showLegend": true, + "width": 350 }, "tooltip": { "mode": "single", "sort": "none" } }, - "pluginVersion": "9.2.2", + "pluginVersion": "9.2.0", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_filecache\"}", - "legendFormat": "filecache", - "range": true, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_anon\"}", - "hide": false, - "legendFormat": "anon", - "range": true, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_mlock\"}", - "hide": false, - "legendFormat": "mlock", - "range": true, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_buffers\"}", - "hide": false, - "legendFormat": "buffers", - "range": true, - "refId": "D" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_shmem\"}", - "hide": false, - "legendFormat": "shmem", - "range": true, - "refId": "E" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_huge_1G\"}", - "hide": false, - "legendFormat": "huge_1G", + "expr": "#node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}\nsysom_proc_meminfo{value=\"SReclaimable\",instance=\"$node\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reclaimable", "range": true, - "refId": "F" + "refId": "A", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{instance=\"$node\",value=\"user_huge_2M\"}", + "expr": "sysom_proc_meminfo{value=\"SUnreclaim\",instance=\"$node\"}", + "format": "time_series", "hide": false, - "legendFormat": "huge_2M", + "intervalFactor": 1, + "legendFormat": "Unreclaim", "range": true, - "refId": "G" + "refId": "D", + "step": 240 } ], - "title": "User Used Memory", + "title": "Memory Slab", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "THP申请频率", + "description": "Page\u6362\u8fdb\u6362\u51fa\u9891\u7387", "fieldConfig": { "defaults": { "color": { @@ -4477,6 +6347,30 @@ } } ] + }, + { + "matcher": { + "id": "byName", + "options": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + } + ] } ] }, @@ -4484,9 +6378,9 @@ "h": 10, "w": 12, "x": 0, - "y": 31 + "y": 41 }, - "id": 191, + "id": 138, "links": [], "options": { "legend": { @@ -4511,11 +6405,10 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"thp_fault_alloc\",instance=\"$node\"}[1m])", + "expr": "rate(sysom_proc_vmstat{value=\"pgpgin\",instance=\"$node\"}[1m])", "format": "time_series", - "hide": false, "intervalFactor": 1, - "legendFormat": "THP Fault Alloc Times", + "legendFormat": "Page In", "range": true, "refId": "A", "step": 240 @@ -4523,11 +6416,10 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"thp_fault_fallback\",instance=\"$node\"}[1m])", + "expr": "rate(sysom_proc_vmstat{value=\"pgpgout\",instance=\"$node\"}[1m])", "format": "time_series", - "hide": false, "intervalFactor": 1, - "legendFormat": "THP Fault Fallback", + "legendFormat": "Page Out", "range": true, "refId": "B", "step": 240 @@ -4535,11 +6427,11 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"thp_collapse_alloc\",instance=\"$node\"}[1m])", + "expr": "rate(sysom_proc_vmstat{value=\"pswpin\",instance=\"$node\"}[1m])", "format": "time_series", - "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "THP Collapse Alloc", + "legendFormat": "pswpin", "range": true, "refId": "C", "step": 240 @@ -4547,34 +6439,22 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"thp_collapse_alloc_failed\",instance=\"$node\"}[1m])", + "expr": "rate(sysom_proc_vmstat{value=\"pswpout\",instance=\"$node\"}[1m])", "format": "time_series", - "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "THP Collapse Alloc Fail", + "legendFormat": "pswpout", "range": true, "refId": "D", "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_meminfo{value=\"AnonHugePages\",instance=\"$node\"}[1m])", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "THP Page Count", - "range": true, - "refId": "E", - "step": 240 } ], - "title": "THP Alloc Rate", + "title": "Page In/Out", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "可回收和不可回收Slab内存大小", + "description": "PageFault\u9891\u7387", "fieldConfig": { "defaults": { "color": { @@ -4584,7 +6464,7 @@ "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", - "axisPlacement": "left", + "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, @@ -4595,12 +6475,15 @@ "viz": false }, "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -4621,7 +6504,7 @@ } ] }, - "unit": "kbytes" + "unit": "short" }, "overrides": [] }, @@ -4629,9 +6512,9 @@ "h": 10, "w": 12, "x": 12, - "y": 31 + "y": 41 }, - "id": 136, + "id": 131, "links": [], "options": { "legend": { @@ -4643,11 +6526,10 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 350 + "showLegend": true }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "none" } }, @@ -4656,10 +6538,10 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "#node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}\nsysom_proc_meminfo{value=\"SReclaimable\",instance=\"$node\"}\n", + "expr": "rate(sysom_proc_vmstat{value=\"pgfault\",instance=\"$node\"}[1m])", "format": "time_series", "intervalFactor": 1, - "legendFormat": "Reclaimable", + "legendFormat": "PageFault", "range": true, "refId": "A", "step": 240 @@ -4667,22 +6549,33 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_meminfo{value=\"SUnreclaim\",instance=\"$node\"}", + "expr": "rate(sysom_proc_vmstat{value=\"pgmajfault\",instance=\"$node\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "Unreclaim", + "legendFormat": "Pgmajfault", "range": true, - "refId": "D", + "refId": "B", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "rate(sysom_proc_vmstat{value=\"pgfault\",instance=\"$node\"}[1m]) - on(instance)rate(sysom_proc_vmstat{value=\"pgmajfault\",instance=\"$node\"}[1m])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Pgminfault", + "range": true, + "refId": "C", "step": 240 } ], - "title": "Memory Slab", + "title": "PageFault Rate", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "Page换进换出频率", "fieldConfig": { "defaults": { "color": { @@ -4691,7 +6584,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "", + "axisLabel": "counter", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -4726,26 +6619,22 @@ "steps": [ { "color": "green" - }, - { - "color": "red", - "value": 80 } ] }, - "unit": "none" + "unit": "short" }, "overrides": [ { "matcher": { "id": "byName", - "options": "Apps" + "options": "Active" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#629E51", + "fixedColor": "#99440A", "mode": "fixed" } } @@ -4760,7 +6649,7 @@ { "id": "color", "value": { - "fixedColor": "#614D93", + "fixedColor": "#58140C", "mode": "fixed" } } @@ -4814,13 +6703,13 @@ { "matcher": { "id": "byName", - "options": "Free" + "options": "Dirty" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#0A437C", + "fixedColor": "#6ED0E0", "mode": "fixed" } } @@ -4829,13 +6718,13 @@ { "matcher": { "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + "options": "Free" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#CFFAFF", + "fixedColor": "#B7DBAB", "mode": "fixed" } } @@ -4850,7 +6739,7 @@ { "id": "color", "value": { - "fixedColor": "#584477", + "fixedColor": "#EA6460", "mode": "fixed" } } @@ -4859,13 +6748,13 @@ { "matcher": { "id": "byName", - "options": "PageTables" + "options": "Mapped" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#0A50A1", + "fixedColor": "#052B51", "mode": "fixed" } } @@ -4874,7 +6763,7 @@ { "matcher": { "id": "byName", - "options": "Page_Tables" + "options": "PageTables" }, "properties": [ { @@ -4889,13 +6778,13 @@ { "matcher": { "id": "byName", - "options": "RAM_Free" + "options": "Page_Tables" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#E0F9D7", + "fixedColor": "#0A50A1", "mode": "fixed" } } @@ -4904,13 +6793,13 @@ { "matcher": { "id": "byName", - "options": "Slab" + "options": "Slab_Cache" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#806EB7", + "fixedColor": "#EAB839", "mode": "fixed" } } @@ -4919,13 +6808,13 @@ { "matcher": { "id": "byName", - "options": "Slab_Cache" + "options": "Swap" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#E0752D", + "fixedColor": "#BF1B00", "mode": "fixed" } } @@ -4934,13 +6823,13 @@ { "matcher": { "id": "byName", - "options": "Swap" + "options": "Swap_Cache" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#BF1B00", + "fixedColor": "#C15C17", "mode": "fixed" } } @@ -4949,13 +6838,13 @@ { "matcher": { "id": "byName", - "options": "Swap_Cache" + "options": "Total" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#C15C17", + "fixedColor": "#511749", "mode": "fixed" } } @@ -4964,13 +6853,13 @@ { "matcher": { "id": "byName", - "options": "Swap_Free" + "options": "Total RAM" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#2F575E", + "fixedColor": "#052B51", "mode": "fixed" } } @@ -4979,13 +6868,13 @@ { "matcher": { "id": "byName", - "options": "Unused" + "options": "Total RAM + Swap" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#EAB839", + "fixedColor": "#052B51", "mode": "fixed" } } @@ -4994,24 +6883,30 @@ { "matcher": { "id": "byName", - "options": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages" + "options": "Total Swap" }, "properties": [ { - "id": "custom.fillOpacity", - "value": 0 + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } } ] }, { "matcher": { "id": "byName", - "options": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages" + "options": "VmallocUsed" }, "properties": [ { - "id": "custom.fillOpacity", - "value": 0 + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } } ] } @@ -5021,143 +6916,9 @@ "h": 10, "w": 12, "x": 0, - "y": 41 - }, - "id": 138, - "links": [], - "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "9.2.0", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"pgpgin\",instance=\"$node\"}[1m])", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Page In", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"pgpgout\",instance=\"$node\"}[1m])", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Page Out", - "range": true, - "refId": "B", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"pswpin\",instance=\"$node\"}[1m])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "pswpin", - "range": true, - "refId": "C", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"pswpout\",instance=\"$node\"}[1m])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "pswpout", - "range": true, - "refId": "D", - "step": 240 - } - ], - "title": "Page In/Out", - "type": "timeseries" - }, - { - "datasource": "sysom-prometheus", - "description": "PageFault频率", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineStyle": { - "fill": "solid" - }, - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 41 + "y": 51 }, - "id": 131, + "id": 307, "links": [], "options": { "legend": { @@ -5181,40 +6942,44 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"pgfault\",instance=\"$node\"}[1m])", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"compact_stall\"}[1m])", "format": "time_series", + "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "PageFault", + "legendFormat": "compact_stall", "range": true, - "refId": "A", + "refId": "E", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"pgmajfault\",instance=\"$node\"}[1m])", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"compact_success\"}[1m])", "format": "time_series", "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "Pgmajfault", + "legendFormat": "compact_success", "range": true, - "refId": "B", + "refId": "F", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{value=\"pgfault\",instance=\"$node\"}[1m]) - on(instance)rate(sysom_proc_vmstat{value=\"pgmajfault\",instance=\"$node\"}[1m])", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"compact_fail\"}[1m])", "format": "time_series", "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "Pgminfault", + "legendFormat": "compact_fail", "range": true, - "refId": "C", + "refId": "A", "step": 240 } ], - "title": "PageFault Rate", + "title": "Memory Compact ", "type": "timeseries" }, { @@ -5558,10 +7323,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 0, + "x": 12, "y": 51 }, - "id": 307, + "id": 339, "links": [], "options": { "legend": { @@ -5585,86 +7350,122 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"compact_stall\"}[1m])", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgscan_kswapd\"}[1m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "pgscan_kswapd", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgscan_direct\"}[1m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "pgscan_direct", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"oom_kill\"}[1m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "oom_kill", + "range": true, + "refId": "D", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgsteal_kswapd\"}[1m])", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "compact_stall", + "legendFormat": "pgsteal_kswapd", "range": true, - "refId": "E", + "refId": "A", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"compact_success\"}[1m])", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgsteal_direct\"}[1m])", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "compact_success", + "legendFormat": "pgsteal_direct", "range": true, - "refId": "F", + "refId": "E", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"compact_fail\"}[1m])", + "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgscan_direct_throttle\"}[1m])", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "compact_fail", + "legendFormat": "pgscan_direct_throttle", "range": true, - "refId": "A", + "refId": "F", "step": 240 } ], - "title": "Memory Compact ", + "title": "Memory Others", "type": "timeseries" - }, + } + ], + "targets": [ + { + "datasource": "sysom-prometheus", + "refId": "A" + } + ], + "title": "System Memory", + "type": "row" + }, + { + "collapsed": true, + "datasource": "sysom-prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 270, + "panels": [ { "datasource": "sysom-prometheus", + "description": "\u78c1\u76d8\u7a7a\u95f4\u4f7f\u7528\u60c5\u51b5\uff0c\u7c7b\u4f3cdf -h\u7684\u7ed3\u679c", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "counter", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "align": "auto", + "displayMode": "auto", + "inspect": false, + "minWidth": 50 }, - "links": [], + "decimals": 2, "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -5673,260 +7474,191 @@ } ] }, - "unit": "short" + "unit": "kbytes" }, "overrides": [ { "matcher": { "id": "byName", - "options": "Active" + "options": "Time" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "Buffers" + "options": "exported_instance" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "Cache" + "options": "instance" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "Cached" + "options": "job" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "Committed" + "options": "Value #B" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } + "id": "displayName", + "value": "Used" } ] }, { "matcher": { "id": "byName", - "options": "Dirty" + "options": "mount 2" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "Free" + "options": "Value #A" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } + "id": "displayName", + "value": "Size" } ] }, { "matcher": { "id": "byName", - "options": "Inactive" + "options": "mount" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } + "id": "displayName", + "value": "Mounted on" } ] }, { "matcher": { "id": "byName", - "options": "Mapped" + "options": "counter 1" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "PageTables" + "options": "counter 2" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "Page_Tables" + "options": "Time 3" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "Slab_Cache" + "options": "fs 2" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "Swap" + "options": "Value #C" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } + "id": "displayName", + "value": "Available" } ] }, { "matcher": { "id": "byName", - "options": "Swap_Cache" + "options": "Use%" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ + "id": "unit", + "value": "percentunit" + }, { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ + "id": "decimals", + "value": 2 + }, { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ + "id": "custom.displayMode", + "value": "gradient-gauge" + }, + { + "id": "max", + "value": 1 + }, { "id": "color", "value": { - "fixedColor": "#052B51", - "mode": "fixed" + "mode": "continuous-GrYlRd" } } ] @@ -5934,168 +7666,138 @@ { "matcher": { "id": "byName", - "options": "Total Swap" + "options": "Total Size(ignore reserve)" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } + "id": "custom.hidden", + "value": true } ] }, { "matcher": { "id": "byName", - "options": "VmallocUsed" + "options": "fs 1" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } + "id": "displayName", + "value": "Filesystem" } ] } ] }, "gridPos": { - "h": 10, + "h": 12, "w": 12, - "x": 12, - "y": 51 + "x": 0, + "y": 4 }, - "id": 339, - "links": [], + "id": 383, "options": { - "legend": { - "calcs": [ - "min", - "mean", - "max", - "lastNotNull" + "footer": { + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "show": false }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Use%" + } + ] }, - "pluginVersion": "9.2.0", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgscan_kswapd\"}[1m])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "pgscan_kswapd", - "range": true, - "refId": "B", - "step": 240 + "exemplar": false, + "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_blocks\"} * 4", + "format": "table", + "instant": true, + "legendFormat": "{{fs}}", + "range": false, + "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgscan_direct\"}[1m])", - "format": "time_series", + "exemplar": false, + "expr": "(sysom_fs_stat{instance=\"$node\",counter=\"f_blocks\"} - on(mount)sysom_fs_stat{instance=\"$node\",counter=\"f_bfree\"}) * 4", + "format": "table", "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "pgscan_direct", - "range": true, - "refId": "C", - "step": 240 + "instant": true, + "legendFormat": "{{fs}}", + "range": false, + "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"oom_kill\"}[1m])", - "format": "time_series", + "exemplar": false, + "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_bavail\"} * 4", + "format": "table", "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "oom_kill", - "range": true, - "refId": "D", - "step": 240 - }, + "instant": true, + "legendFormat": "{{fs}}", + "range": false, + "refId": "C" + } + ], + "title": "df -h info", + "transformations": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgsteal_kswapd\"}[1m])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "pgsteal_kswapd", - "range": true, - "refId": "A", - "step": 240 + "id": "joinByField", + "options": { + "byField": "mount", + "mode": "outer" + } }, { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgsteal_direct\"}[1m])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "pgsteal_direct", - "range": true, - "refId": "E", - "step": 240 + "id": "calculateField", + "options": { + "alias": "Total Size(ignore reserve)", + "binary": { + "left": "Value #B", + "operator": "+", + "reducer": "sum", + "right": "Value #C" + }, + "mode": "binary", + "reduce": { + "reducer": "sum" + } + } }, { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "rate(sysom_proc_vmstat{instance=\"$node\",value=\"pgscan_direct_throttle\"}[1m])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "pgscan_direct_throttle", - "range": true, - "refId": "F", - "step": 240 + "id": "calculateField", + "options": { + "alias": "Use%", + "binary": { + "left": "Value #B", + "operator": "/", + "reducer": "sum", + "right": "Total Size(ignore reserve)" + }, + "mode": "binary", + "reduce": { + "reducer": "sum" + } + } } ], - "title": "Memory Others", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": "sysom-prometheus", - "refId": "A" - } - ], - "title": "System Memory", - "type": "row" - }, - { - "collapsed": true, - "datasource": "sysom-prometheus", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 73 - }, - "id": 270, - "panels": [ + "type": "table" + }, { "datasource": "sysom-prometheus", - "description": "磁盘空间使用情况,类似df -h的结果", + "description": "\u7c7b\u4f3cdf -i\u7684\u7ed3\u679c\uff0c\u67e5\u770b\u6587\u4ef6\u7cfb\u7edfinode\u4f7f\u7528\u60c5\u51b5", "fieldConfig": { "defaults": { "color": { @@ -6107,7 +7809,6 @@ "inspect": false, "minWidth": 50 }, - "decimals": 2, "mappings": [], "thresholds": { "mode": "absolute", @@ -6117,7 +7818,7 @@ } ] }, - "unit": "kbytes" + "unit": "none" }, "overrides": [ { @@ -6135,19 +7836,7 @@ { "matcher": { "id": "byName", - "options": "exported_instance" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "instance" + "options": "__name__" }, "properties": [ { @@ -6159,7 +7848,7 @@ { "matcher": { "id": "byName", - "options": "job" + "options": "exported_instance" }, "properties": [ { @@ -6176,7 +7865,7 @@ "properties": [ { "id": "displayName", - "value": "Used" + "value": "IUsed" } ] }, @@ -6200,7 +7889,7 @@ "properties": [ { "id": "displayName", - "value": "Size" + "value": "Inodes" } ] }, @@ -6219,43 +7908,19 @@ { "matcher": { "id": "byName", - "options": "counter 1" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "counter 2" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Time 3" + "options": "fs 1" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "displayName", + "value": "Filesystem" } ] }, { "matcher": { "id": "byName", - "options": "fs 2" + "options": "counter" }, "properties": [ { @@ -6272,14 +7937,14 @@ "properties": [ { "id": "displayName", - "value": "Available" + "value": "IFree" } ] }, { "matcher": { "id": "byName", - "options": "Use%" + "options": "IUse%" }, "properties": [ { @@ -6309,7 +7974,7 @@ { "matcher": { "id": "byName", - "options": "Total Size(ignore reserve)" + "options": "job" }, "properties": [ { @@ -6321,12 +7986,24 @@ { "matcher": { "id": "byName", - "options": "fs 1" + "options": "instance" }, "properties": [ { - "id": "displayName", - "value": "Filesystem" + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "fs 2" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true } ] } @@ -6335,10 +8012,10 @@ "gridPos": { "h": 12, "w": 12, - "x": 0, + "x": 12, "y": 4 }, - "id": 383, + "id": 385, "options": { "footer": { "enablePagination": true, @@ -6352,7 +8029,7 @@ "sortBy": [ { "desc": true, - "displayName": "Use%" + "displayName": "IUse%" } ] }, @@ -6362,7 +8039,7 @@ "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": false, - "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_blocks\"} * 4", + "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_files\"}", "format": "table", "instant": true, "legendFormat": "{{fs}}", @@ -6373,7 +8050,7 @@ "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": false, - "expr": "(sysom_fs_stat{instance=\"$node\",counter=\"f_blocks\"} - on(mount)sysom_fs_stat{instance=\"$node\",counter=\"f_bfree\"}) * 4", + "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_files\"} - on(mount)sysom_fs_stat{instance=\"$node\",counter=\"f_ffree\"}", "format": "table", "hide": false, "instant": true, @@ -6385,7 +8062,7 @@ "datasource": "sysom-prometheus", "editorMode": "code", "exemplar": false, - "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_bavail\"} * 4", + "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_ffree\"}", "format": "table", "hide": false, "instant": true, @@ -6394,7 +8071,7 @@ "refId": "C" } ], - "title": "df -h info", + "title": "df -i info", "transformations": [ { "id": "joinByField", @@ -6406,58 +8083,249 @@ { "id": "calculateField", "options": { - "alias": "Total Size(ignore reserve)", + "alias": "IUse%", "binary": { "left": "Value #B", - "operator": "+", + "operator": "/", "reducer": "sum", - "right": "Value #C" + "right": "Value #A" }, "mode": "binary", "reduce": { "reducer": "sum" } } + } + ], + "type": "table" + }, + { + "datasource": "sysom-prometheus", + "description": "\u6587\u4ef6\u7cfb\u7edf\u7a7a\u95f4\u4f7f\u7528\u91cf", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "KBs" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 387, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ { - "id": "calculateField", - "options": { - "alias": "Use%", - "binary": { - "left": "Value #B", - "operator": "/", - "reducer": "sum", - "right": "Total Size(ignore reserve)" + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "- delta(sysom_fs_stat{instance=\"$node\",counter=\"f_bfree\"}[1m]) * 4 / 60", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "{{mount}}({{fs}})", + "range": true, + "refId": "B" + } + ], + "title": "Filesystem used increase rate", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "Filesystem Inode \u4f7f\u7528\u60c5\u51b5", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 386, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "-delta(sysom_fs_stat{instance=\"$node\",counter=\"f_ffree\"}[1m])", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "{{mount}}({{fs}})", + "range": true, + "refId": "B" + } + ], + "title": "Filesystem Inode used increase rate", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "description": "\u78c1\u76d8\u53d1\u751fio hang\u7684\u6b21\u6570", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "IOs", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "mode": "binary", - "reduce": { - "reducer": "sum" + "thresholdsStyle": { + "mode": "off" } - } - } - ], - "type": "table" - }, - { - "datasource": "sysom-prometheus", - "description": "类似df -i的结果,查看文件系统inode使用情况", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "displayMode": "auto", - "inspect": false, - "minWidth": 50 }, + "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, @@ -6466,433 +8334,323 @@ "overrides": [ { "matcher": { - "id": "byName", - "options": "Time" + "id": "byRegexp", + "options": "/.*sda_.*/" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "__name__" + "id": "byRegexp", + "options": "/.*sdb_.*/" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "exported_instance" + "id": "byRegexp", + "options": "/.*sdc_.*/" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "Value #B" + "id": "byRegexp", + "options": "/.*sdd_.*/" }, "properties": [ { - "id": "displayName", - "value": "IUsed" + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "mount 2" + "id": "byRegexp", + "options": "/.*sde_.*/" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "Value #A" + "id": "byRegexp", + "options": "/.*sda1.*/" }, "properties": [ { - "id": "displayName", - "value": "Inodes" + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "mount" + "id": "byRegexp", + "options": "/.*sda2_.*/" }, "properties": [ { - "id": "displayName", - "value": "Mounted on" + "id": "color", + "value": { + "fixedColor": "#BA43A9", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "fs 1" + "id": "byRegexp", + "options": "/.*sda3_.*/" }, "properties": [ { - "id": "displayName", - "value": "Filesystem" + "id": "color", + "value": { + "fixedColor": "#F4D598", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "counter" + "id": "byRegexp", + "options": "/.*sdb1.*/" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "Value #C" + "id": "byRegexp", + "options": "/.*sdb2.*/" }, "properties": [ { - "id": "displayName", - "value": "IFree" + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "IUse%" + "id": "byRegexp", + "options": "/.*sdb3.*/" }, "properties": [ { - "id": "unit", - "value": "percentunit" - }, + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc1.*/" + }, + "properties": [ { - "id": "decimals", - "value": 2 - }, + "id": "color", + "value": { + "fixedColor": "#962D82", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc2.*/" + }, + "properties": [ { - "id": "custom.displayMode", - "value": "gradient-gauge" - }, + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdc3.*/" + }, + "properties": [ { - "id": "max", - "value": 1 - }, + "id": "color", + "value": { + "fixedColor": "#9AC48A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd1.*/" + }, + "properties": [ { "id": "color", "value": { - "mode": "continuous-GrYlRd" + "fixedColor": "#65C5DB", + "mode": "fixed" } } ] }, { "matcher": { - "id": "byName", - "options": "job" + "id": "byRegexp", + "options": "/.*sdd2.*/" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "color", + "value": { + "fixedColor": "#F9934E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sdd3.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*sde1.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "instance" + "id": "byRegexp", + "options": "/.*sdd2.*/" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "color", + "value": { + "fixedColor": "#FCEACA", + "mode": "fixed" + } } ] }, { "matcher": { - "id": "byName", - "options": "fs 2" + "id": "byRegexp", + "options": "/.*sde3.*/" }, "properties": [ { - "id": "custom.hidden", - "value": true + "id": "color", + "value": { + "fixedColor": "#F9E2D2", + "mode": "fixed" + } } ] } ] }, "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 4 - }, - "id": 385, - "options": { - "footer": { - "enablePagination": true, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "IUse%" - } - ] - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_files\"}", - "format": "table", - "instant": true, - "legendFormat": "{{fs}}", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_files\"} - on(mount)sysom_fs_stat{instance=\"$node\",counter=\"f_ffree\"}", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "{{fs}}", - "range": false, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sysom_fs_stat{instance=\"$node\",counter=\"f_ffree\"}", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "{{fs}}", - "range": false, - "refId": "C" - } - ], - "title": "df -i info", - "transformations": [ - { - "id": "joinByField", - "options": { - "byField": "mount", - "mode": "outer" - } - }, - { - "id": "calculateField", - "options": { - "alias": "IUse%", - "binary": { - "left": "Value #B", - "operator": "/", - "reducer": "sum", - "right": "Value #A" - }, - "mode": "binary", - "reduce": { - "reducer": "sum" - } - } - } - ], - "type": "table" - }, - { - "datasource": "sysom-prometheus", - "description": "文件系统空间使用量", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - } - ] - }, - "unit": "KBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 387, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.2.2", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "- delta(sysom_fs_stat{instance=\"$node\",counter=\"f_bfree\"}[1m]) * 4 / 60", - "format": "time_series", - "hide": false, - "instant": false, - "legendFormat": "{{mount}}({{fs}})", - "range": true, - "refId": "B" - } - ], - "title": "Filesystem used increase rate", - "transformations": [], - "type": "timeseries" - }, - { - "datasource": "sysom-prometheus", - "description": "Filesystem Inode 使用情况", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, + "h": 10, "w": 12, - "x": 12, - "y": 16 + "x": 0, + "y": 28 }, - "id": 386, + "id": 301, + "links": [], "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, @@ -6901,28 +8659,26 @@ "sort": "none" } }, - "pluginVersion": "9.2.2", + "pluginVersion": "9.2.0", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "-delta(sysom_fs_stat{instance=\"$node\",counter=\"f_ffree\"}[1m])", - "format": "time_series", - "hide": false, - "instant": false, - "legendFormat": "{{mount}}({{fs}})", + "expr": "sysom_IOMonIndForDisksIO{instance=\"$node\",value=\"iohangCnt\"}", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{devname}}", "range": true, - "refId": "B" + "refId": "A", + "step": 240 } ], - "title": "Filesystem Inode used increase rate", - "transformations": [], + "title": "Disk IO iohangCnt", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "磁盘发生io hang的次数", + "description": "\u78c1\u76d8\u53d1\u751fio delay\u7684\u6b21\u6570", "fieldConfig": { "defaults": { "color": { @@ -7280,10 +9036,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 0, + "x": 12, "y": 28 }, - "id": 301, + "id": 401, "links": [], "options": { "legend": { @@ -7307,7 +9063,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_IOMonIndForDisksIO{instance=\"$node\",value=\"iohangCnt\"}", + "expr": "sysom_IOMonIndForDisksIO{instance=\"$node\",value=\"iodelayCnt\"}", "interval": "", "intervalFactor": 4, "legendFormat": "{{devname}}", @@ -7316,12 +9072,12 @@ "step": 240 } ], - "title": "Disk IO iohangCnt", + "title": "Disk IO iodelayCnt", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "磁盘发生io delay的次数", + "description": "\u78c1\u76d8\u53d1\u751fio burst\u7684\u6b21\u6570", "fieldConfig": { "defaults": { "color": { @@ -7679,10 +9435,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 12, - "y": 28 + "x": 0, + "y": 38 }, - "id": 401, + "id": 402, "links": [], "options": { "legend": { @@ -7706,7 +9462,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_IOMonIndForDisksIO{instance=\"$node\",value=\"iodelayCnt\"}", + "expr": "sysom_IOMonIndForDisksIO{instance=\"$node\",value=\"ioburstCnt\"}", "interval": "", "intervalFactor": 4, "legendFormat": "{{devname}}", @@ -7715,12 +9471,12 @@ "step": 240 } ], - "title": "Disk IO iodelayCnt", + "title": "Disk IO ioburstCnt", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "磁盘发生io burst的次数", + "description": "\u7cfb\u7edf\u53d1\u751fio wait \u9ad8\u7684\u6b21\u6570", "fieldConfig": { "defaults": { "color": { @@ -8078,10 +9834,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 0, + "x": 12, "y": 38 }, - "id": 402, + "id": 403, "links": [], "options": { "legend": { @@ -8105,21 +9861,21 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_IOMonIndForDisksIO{instance=\"$node\",value=\"ioburstCnt\"}", + "expr": "sysom_IOMonIndForSystemIO{instance=\"$node\",value=\"iowaithighCnt\"}", "interval": "", "intervalFactor": 4, - "legendFormat": "{{devname}}", + "legendFormat": "{{devname}} iowait high count", "range": true, "refId": "A", "step": 240 } ], - "title": "Disk IO ioburstCnt", + "title": "System IO iowaithighCnt", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "系统发生io wait 高的次数", + "description": "", "fieldConfig": { "defaults": { "color": { @@ -8169,7 +9925,7 @@ } ] }, - "unit": "none" + "unit": "percent" }, "overrides": [ { @@ -8477,10 +10233,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 12, - "y": 38 + "x": 0, + "y": 48 }, - "id": 403, + "id": 400, "links": [], "options": { "legend": { @@ -8504,21 +10260,21 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_IOMonIndForSystemIO{instance=\"$node\",value=\"iowaithighCnt\"}", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"busy\",disk_name!~\"[a-z]*[0-9]$\"}", "interval": "", "intervalFactor": 4, - "legendFormat": "{{devname}} iowait high count", + "legendFormat": "{{disk_name}}", "range": true, "refId": "A", "step": 240 } ], - "title": "System IO iowaithighCnt", + "title": "Disk IO Utils", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "", + "description": "The number (after merges) of I/O requests completed per second for the device", "fieldConfig": { "defaults": { "color": { @@ -8527,7 +10283,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "IOs", + "axisLabel": "IO read (-) / write (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -8568,9 +10324,21 @@ } ] }, - "unit": "percent" + "unit": "iops" }, "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, { "matcher": { "id": "byRegexp", @@ -8876,10 +10644,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 0, + "x": 12, "y": 48 }, - "id": 400, + "id": 9, "links": [], "options": { "legend": { @@ -8903,21 +10671,30 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"busy\",disk_name!~\"[a-z]*[0-9]$\"}", - "interval": "", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"reads\",disk_name!~\"[a-z]*[0-9]$\"}", "intervalFactor": 4, - "legendFormat": "{{disk_name}}", + "legendFormat": "{{disk_name}} - Reads completed", "range": true, "refId": "A", "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"writes\",disk_name!~\"[a-z]*[0-9]$\"}", + "intervalFactor": 1, + "legendFormat": "{{disk_name}} - Writes completed", + "range": true, + "refId": "B", + "step": 240 } ], - "title": "Disk IO Utils", + "title": "Disk IOps Completed", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "The number (after merges) of I/O requests completed per second for the device", + "description": "The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.", "fieldConfig": { "defaults": { "color": { @@ -8926,11 +10703,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "IO read (-) / write (+)", + "axisLabel": "time. read (-) / write (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 20, + "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, @@ -8967,7 +10744,7 @@ } ] }, - "unit": "iops" + "unit": "ms" }, "overrides": [ { @@ -9287,10 +11064,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 12, - "y": 48 + "x": 0, + "y": 58 }, - "id": 9, + "id": 37, "links": [], "options": { "legend": { @@ -9314,9 +11091,11 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"reads\",disk_name!~\"[a-z]*[0-9]$\"}", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"rmsec\",disk_name!~\"[a-z]*[0-9]$\"}", + "hide": false, + "interval": "", "intervalFactor": 4, - "legendFormat": "{{disk_name}} - Reads completed", + "legendFormat": "{{disk_name}} - Read wait time avg", "range": true, "refId": "A", "step": 240 @@ -9324,20 +11103,22 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"writes\",disk_name!~\"[a-z]*[0-9]$\"}", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"wmsec\",disk_name!~\"[a-z]*[0-9]$\"}", + "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "{{disk_name}} - Writes completed", + "legendFormat": "{{disk_name}} - Write wait time avg", "range": true, "refId": "B", "step": 240 } ], - "title": "Disk IOps Completed", + "title": "Disk Average Wait Time", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.", + "description": "The number of bytes read from or written to the device per second", "fieldConfig": { "defaults": { "color": { @@ -9346,11 +11127,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "time. read (-) / write (+)", + "axisLabel": "bytes read (-) / write (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 30, + "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, @@ -9387,7 +11168,7 @@ } ] }, - "unit": "ms" + "unit": "KBs" }, "overrides": [ { @@ -9707,10 +11488,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 0, + "x": 12, "y": 58 }, - "id": 37, + "id": 33, "links": [], "options": { "legend": { @@ -9734,11 +11515,10 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"rmsec\",disk_name!~\"[a-z]*[0-9]$\"}", - "hide": false, - "interval": "", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"rkb\",disk_name!~\"[a-z]*[0-9]$\"}", + "format": "time_series", "intervalFactor": 4, - "legendFormat": "{{disk_name}} - Read wait time avg", + "legendFormat": "{{disk_name}} - Read bytes", "range": true, "refId": "A", "step": 240 @@ -9746,22 +11526,21 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"wmsec\",disk_name!~\"[a-z]*[0-9]$\"}", - "hide": false, - "interval": "", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"wkb\",disk_name!~\"[a-z]*[0-9]$\"}", + "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{disk_name}} - Write wait time avg", + "legendFormat": "{{disk_name}} - Written bytes", "range": true, "refId": "B", "step": 240 } ], - "title": "Disk Average Wait Time", + "title": "Disk R/W Data", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "The number of bytes read from or written to the device per second", + "description": "The average queue length of the requests that were issued to the device", "fieldConfig": { "defaults": { "color": { @@ -9770,7 +11549,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "bytes read (-) / write (+)", + "axisLabel": "aqu-sz", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -9797,8 +11576,10 @@ "mode": "off" } }, + "decimals": 2, "links": [], "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -9811,21 +11592,9 @@ } ] }, - "unit": "KBs" + "unit": "none" }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, { "matcher": { "id": "byRegexp", @@ -10131,10 +11900,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 12, - "y": 58 + "x": 0, + "y": 68 }, - "id": 33, + "id": 35, "links": [], "options": { "legend": { @@ -10158,32 +11927,21 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"rkb\",disk_name!~\"[a-z]*[0-9]$\"}", - "format": "time_series", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"backlog\",disk_name!~\"[a-z]*[0-9]$\"} / 1000", + "interval": "", "intervalFactor": 4, - "legendFormat": "{{disk_name}} - Read bytes", + "legendFormat": "{{disk_name}}", "range": true, "refId": "A", "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"wkb\",disk_name!~\"[a-z]*[0-9]$\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{disk_name}} - Written bytes", - "range": true, - "refId": "B", - "step": 240 } ], - "title": "Disk R/W Data", + "title": "Average Queue Size", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "The average queue length of the requests that were issued to the device", + "description": "The number of read and write requests merged per second that were queued to the device", "fieldConfig": { "defaults": { "color": { @@ -10192,7 +11950,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "aqu-sz", + "axisLabel": "I/Os", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -10219,10 +11977,8 @@ "mode": "off" } }, - "decimals": 2, "links": [], "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -10235,9 +11991,21 @@ } ] }, - "unit": "none" + "unit": "iops" }, "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, { "matcher": { "id": "byRegexp", @@ -10543,10 +12311,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 0, + "x": 12, "y": 68 }, - "id": 35, + "id": 133, "links": [], "options": { "legend": { @@ -10570,21 +12338,30 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"backlog\",disk_name!~\"[a-z]*[0-9]$\"} / 1000", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{disk_name}}", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"rmerge\",disk_name!~\"[a-z]*[0-9]$\"}", + "intervalFactor": 1, + "legendFormat": "{{disk_name}} - Read merged", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"wmerge\",disk_name!~\"[a-z]*[0-9]$\"}", + "intervalFactor": 1, + "legendFormat": "{{disk_name}} - Write merged", "range": true, - "refId": "A", + "refId": "B", "step": 240 } ], - "title": "Average Queue Size", + "title": "Disk R/W Merged", "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "The number of read and write requests merged per second that were queued to the device", + "description": "The number of outstanding requests at the instant the sample was taken. Incremented as requests are given to appropriate struct request_queue and decremented as they finish.", "fieldConfig": { "defaults": { "color": { @@ -10593,7 +12370,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "I/Os", + "axisLabel": "Outstanding req", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -10622,6 +12399,7 @@ }, "links": [], "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -10634,21 +12412,9 @@ } ] }, - "unit": "iops" + "unit": "none" }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, { "matcher": { "id": "byRegexp", @@ -10954,10 +12720,10 @@ "gridPos": { "h": 10, "w": 12, - "x": 12, - "y": 68 + "x": 0, + "y": 78 }, - "id": 133, + "id": 34, "links": [], "options": { "legend": { @@ -10971,238 +12737,475 @@ "placement": "bottom", "showLegend": true }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.2.0", - "targets": [ + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_disks{instance=\"$node\",counter=\"inflight\",disk_name!~\"[a-z]*[0-9]$\"}", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{disk_name}} - IO now", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Inflight IO/s", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": "sysom-prometheus", + "refId": "A" + } + ], + "title": "Storage Filesystem and IO", + "type": "row" + }, + { + "collapsed": true, + "datasource": "sysom-prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 272, + "panels": [ + { + "datasource": "sysom-prometheus", + "description": "\u7edf\u8ba1\u5404\u4e2a\u7f51\u53e3\u7684\u6536\u53d1\u5305\u60c5\u51b5", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "decimals": 0, + "mappings": [], + "min": -2, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 5 + }, + "id": 393, + "options": { + "displayLabels": [ + "percent", + "value" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "asc" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_ipackets\"}", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "{{network_name}} rx", + "range": false, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_opackets\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "{{network_name}} tx", + "range": false, + "refId": "B" + } + ], + "title": "Network Traffic RTX", + "type": "piechart" + }, + { + "datasource": "sysom-prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "decimals": 1, + "mappings": [], + "min": -4, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 5 + }, + "id": 394, + "options": { + "displayLabels": [ + "percent", + "value" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_net_tcp_count{instance=\"$node\",value=\"InSegs\"}", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "tcp in", + "range": false, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_net_tcp_count{instance=\"$node\",value=\"OutSegs\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "tcp out", + "range": false, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_net_udp_count{instance=\"$node\",value=\"InDatagrams\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "udp in", + "range": false, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_net_udp_count{instance=\"$node\",value=\"OutDatagrams\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "udp out", + "range": false, + "refId": "D" + }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"rmerge\",disk_name!~\"[a-z]*[0-9]$\"}", - "intervalFactor": 1, - "legendFormat": "{{disk_name}} - Read merged", - "range": true, - "refId": "A", - "step": 240 + "exemplar": false, + "expr": "sysom_net_ip_count{instance=\"$node\",value=\"InReceives\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "ip in", + "range": false, + "refId": "E" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"wmerge\",disk_name!~\"[a-z]*[0-9]$\"}", - "intervalFactor": 1, - "legendFormat": "{{disk_name}} - Write merged", - "range": true, - "refId": "B", - "step": 240 + "exemplar": false, + "expr": "sysom_net_ip_count{instance=\"$node\",value=\"OutRequests\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "ip out", + "range": false, + "refId": "F" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_net_icmp_count{instance=\"$node\",value=\"InMsgs\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "icmp in", + "range": false, + "refId": "G" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sysom_net_icmp_count{instance=\"$node\",value=\"OutMsgs\"}", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "icmp out", + "range": false, + "refId": "H" } ], - "title": "Disk R/W Merged", - "type": "timeseries" + "title": "Netstat RTX Graph", + "type": "piechart" }, { "datasource": "sysom-prometheus", - "description": "The number of outstanding requests at the instant the sample was taken. Incremented as requests are given to appropriate struct request_queue and decremented as they finish.", + "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Outstanding req", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" } }, - "links": [], + "decimals": 1, "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" + "min": -4, + "unit": "pps" }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 5 + }, + "id": 397, + "options": { + "displayLabels": [ + "percent", + "value" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "irate(sysom_net_icmp_count{instance=\"$node\",value=\"InErrors\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "icmp InError", + "range": false, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sum(irate(sysom_net_udp_count{instance=\"$node\",value!=\"InDatagrams\",value!=\"OutDatagrams\"}[$__rate_interval]))", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "udp errors", + "range": false, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sum (irate(sysom_net_tcp_ext_count{instance=\"$node\"}[$__rate_interval]))", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "tcp errors", + "range": false, + "refId": "G" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "exemplar": false, + "expr": "sum(sysom_net_retrans_count{instance=\"$node\"})", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "tcp retrans", + "range": false, + "refId": "C" + } + ], + "title": "Netstat Error Count", + "type": "piechart" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] + "thresholdsStyle": { + "mode": "off" + } }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ + "color": "green" + }, { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } + "color": "red", + "value": 80 } ] }, + "unit": "pps" + }, + "overrides": [ { "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" + "id": "byName", + "options": "receive_packets_eth0" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#BF1B00", + "fixedColor": "#7EB26D", "mode": "fixed" } } @@ -11210,14 +13213,14 @@ }, { "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" + "id": "byName", + "options": "receive_packets_lo" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#E0752D", + "fixedColor": "#E24D42", "mode": "fixed" } } @@ -11225,14 +13228,14 @@ }, { "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" + "id": "byName", + "options": "transmit_packets_eth0" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#962D82", + "fixedColor": "#7EB26D", "mode": "fixed" } } @@ -11240,14 +13243,14 @@ }, { "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" + "id": "byName", + "options": "transmit_packets_lo" }, "properties": [ { "id": "color", "value": { - "fixedColor": "#614D93", + "fixedColor": "#E24D42", "mode": "fixed" } } @@ -11256,105 +13259,258 @@ { "matcher": { "id": "byRegexp", - "options": "/.*sdc3.*/" + "options": "/.*Trans.*/" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } + "id": "custom.transform", + "value": "negative-Y" } ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 60, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_ipackets\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{network_name}} - Receive", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_opackets\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{network_name}} - Transmit", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic by Packets", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "properties": [ + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, + "unit": "pps" + }, + "overrides": [ { "matcher": { "id": "byRegexp", - "options": "/.*sde1.*/" + "options": "/.*Trans.*/" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } + "id": "custom.transform", + "value": "negative-Y" } ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 143, + "links": [], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_idrop\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{network_name}} - Receive drop", + "range": true, + "refId": "A", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_odrop\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{network_name}} - Transmit drop", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Drop", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "properties": [ + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, + "unit": "pps" + }, + "overrides": [ { "matcher": { "id": "byRegexp", - "options": "/.*sde3.*/" + "options": "/.*Trans.*/" }, "properties": [ { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } + "id": "custom.transform", + "value": "negative-Y" } ] } @@ -11364,9 +13520,9 @@ "h": 10, "w": 12, "x": 0, - "y": 78 + "y": 24 }, - "id": 34, + "id": 146, "links": [], "options": { "legend": { @@ -11378,10 +13534,11 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true + "showLegend": true, + "width": 300 }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "none" } }, @@ -11390,400 +13547,412 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_disks{instance=\"$node\",counter=\"inflight\",disk_name!~\"[a-z]*[0-9]$\"}", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{disk_name}} - IO now", + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_imulticast\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{network_name}} - Receive multicast", "range": true, "refId": "A", "step": 240 } ], - "title": "Inflight IO/s", + "title": "Network Traffic Multicast", "type": "timeseries" - } - ], - "targets": [ - { - "datasource": "sysom-prometheus", - "refId": "A" - } - ], - "title": "Storage Filesystem and IO", - "type": "row" - }, - { - "collapsed": true, - "datasource": "sysom-prometheus", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 74 - }, - "id": 272, - "panels": [ + }, { "datasource": "sysom-prometheus", - "description": "统计各个网口的收发包情况", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "packets out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" } }, - "decimals": 0, + "links": [], "mappings": [], - "min": -2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, "unit": "pps" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Trans.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] }, "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 5 + "h": 10, + "w": 12, + "x": 12, + "y": 24 }, - "id": 393, + "id": 142, + "links": [], "options": { - "displayLabels": [ - "percent", - "value" - ], "legend": { - "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] - }, - "pieType": "pie", - "reduceOptions": { "calcs": [ - "lastNotNull" + "mean", + "lastNotNull", + "max", + "min" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 }, "tooltip": { - "mode": "single", - "sort": "asc" + "mode": "multi", + "sort": "none" } }, - "pluginVersion": "9.2.2", + "pluginVersion": "9.2.0", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_ipackets\"}", + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_ierrs\"}", "format": "time_series", - "instant": true, - "interval": "", - "legendFormat": "{{network_name}} rx", - "range": false, - "refId": "A" + "intervalFactor": 1, + "legendFormat": "{{network_name}} - Receive errors", + "range": true, + "refId": "A", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_opackets\"}", + "expr": "sysom_proc_networks{instance=\"$node\",counter=\"of_ierrs\"}", "format": "time_series", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "{{network_name}} tx", - "range": false, - "refId": "B" + "intervalFactor": 1, + "legendFormat": "{{network_name}} - Rransmit errors", + "range": true, + "refId": "B", + "step": 240 } ], - "title": "Network Traffic RTX", - "type": "piechart" + "title": "Network Traffic Errors", + "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" } }, - "decimals": 1, + "links": [], "mappings": [], - "min": -4, - "unit": "pps" + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 5 + "h": 10, + "w": 12, + "x": 0, + "y": 34 }, - "id": 394, + "id": 63, + "links": [], "options": { - "displayLabels": [ - "percent", - "value" - ], "legend": { - "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] - }, - "pieType": "pie", - "reduceOptions": { "calcs": [ - "lastNotNull" + "mean", + "lastNotNull", + "max", + "min" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "none" } }, - "pluginVersion": "9.2.2", + "pluginVersion": "9.2.0", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sysom_net_tcp_count{instance=\"$node\",value=\"InSegs\"}", - "format": "time_series", - "instant": true, - "interval": "", - "legendFormat": "tcp in", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sysom_net_tcp_count{instance=\"$node\",value=\"OutSegs\"}", - "format": "time_series", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "tcp out", - "range": false, - "refId": "B" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sysom_net_udp_count{instance=\"$node\",value=\"InDatagrams\"}", - "format": "time_series", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "udp in", - "range": false, - "refId": "C" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sysom_net_udp_count{instance=\"$node\",value=\"OutDatagrams\"}", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_alloc\"}", "format": "time_series", - "hide": false, - "instant": true, "interval": "", - "legendFormat": "udp out", - "range": false, - "refId": "D" + "intervalFactor": 1, + "legendFormat": "TCP_alloc - Allocated sockets", + "range": true, + "refId": "A", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sysom_net_ip_count{instance=\"$node\",value=\"InReceives\"}", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_inuse\"}", "format": "time_series", - "hide": false, - "instant": true, "interval": "", - "legendFormat": "ip in", - "range": false, - "refId": "E" + "intervalFactor": 1, + "legendFormat": "TCP_inuse - Tcp sockets currently in use", + "range": true, + "refId": "B", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sysom_net_ip_count{instance=\"$node\",value=\"OutRequests\"}", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_mem\"}", "format": "time_series", - "hide": false, - "instant": true, + "hide": true, "interval": "", - "legendFormat": "ip out", - "range": false, - "refId": "F" + "intervalFactor": 1, + "legendFormat": "TCP_mem - Used memory for tcp", + "range": true, + "refId": "C", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sysom_net_icmp_count{instance=\"$node\",value=\"InMsgs\"}", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_orphan\"}", "format": "time_series", - "hide": false, - "instant": true, "interval": "", - "legendFormat": "icmp in", - "range": false, - "refId": "G" + "intervalFactor": 1, + "legendFormat": "TCP_orphan - Orphan sockets", + "range": true, + "refId": "D", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sysom_net_icmp_count{instance=\"$node\",value=\"OutMsgs\"}", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_tw\"}", "format": "time_series", - "hide": false, - "instant": true, "interval": "", - "legendFormat": "icmp out", - "range": false, - "refId": "H" + "intervalFactor": 1, + "legendFormat": "TCP_tw - Sockets wating close", + "range": true, + "refId": "E", + "step": 240 } ], - "title": "Netstat RTX Graph", - "type": "piechart" + "title": "Sockstat TCP", + "type": "timeseries" }, { "datasource": "sysom-prometheus", - "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "counter", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" } }, - "decimals": 1, + "links": [], "mappings": [], - "min": -4, - "unit": "pps" + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 5 + "h": 10, + "w": 12, + "x": 12, + "y": 34 }, - "id": 397, + "id": 124, + "links": [], "options": { - "displayLabels": [ - "percent", - "value" - ], "legend": { - "displayMode": "table", - "placement": "right", - "showLegend": true, - "values": [ - "value" - ] - }, - "pieType": "pie", - "reduceOptions": { "calcs": [ - "lastNotNull" + "mean", + "lastNotNull", + "max", + "min" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 }, "tooltip": { - "mode": "single", + "mode": "multi", "sort": "none" } }, - "pluginVersion": "9.2.2", + "pluginVersion": "9.2.0", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "irate(sysom_net_icmp_count{instance=\"$node\",value=\"InErrors\"}[$__rate_interval])", - "format": "time_series", - "instant": true, - "interval": "", - "legendFormat": "icmp InError", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sum(irate(sysom_net_udp_count{instance=\"$node\",value!=\"InDatagrams\",value!=\"OutDatagrams\"}[$__rate_interval]))", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"udplite_inuse\"}", "format": "time_series", - "hide": false, - "instant": true, "interval": "", - "legendFormat": "udp errors", - "range": false, - "refId": "B" + "intervalFactor": 1, + "legendFormat": "UDPLITE_inuse - Udplite sockets currently in use", + "range": true, + "refId": "A", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sum (irate(sysom_net_tcp_ext_count{instance=\"$node\"}[$__rate_interval]))", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"udp_inuse\"}", "format": "time_series", - "hide": false, - "instant": true, "interval": "", - "legendFormat": "tcp errors", - "range": false, - "refId": "G" + "intervalFactor": 1, + "legendFormat": "UDP_inuse - Udp sockets currently in use", + "range": true, + "refId": "B", + "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "exemplar": false, - "expr": "sum(sysom_net_retrans_count{instance=\"$node\"})", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"udp_mem\"}", "format": "time_series", - "hide": false, - "instant": true, "interval": "", - "legendFormat": "tcp retrans", - "range": false, - "refId": "C" + "intervalFactor": 1, + "legendFormat": "UDP_mem - Used memory for udp", + "range": true, + "refId": "C", + "step": 240 } ], - "title": "Netstat Error Count", - "type": "piechart" + "title": "Sockstat UDP", + "type": "timeseries" }, { "datasource": "sysom-prometheus", @@ -11795,7 +13964,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "packets out (-) / in (+)", + "axisLabel": "sockets", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -11824,6 +13993,7 @@ }, "links": [], "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -11836,90 +14006,17 @@ } ] }, - "unit": "pps" + "unit": "short" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "receive_packets_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "receive_packets_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "transmit_packets_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "transmit_packets_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*Trans.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 10, "w": 12, "x": 0, - "y": 14 + "y": 44 }, - "id": 60, + "id": 126, "links": [], "options": { "legend": { @@ -11944,29 +14041,17 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_ipackets\"}", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"sockets_used\"}", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "{{network_name}} - Receive", + "legendFormat": "Sockets_used - Sockets currently in use", "range": true, "refId": "A", "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_opackets\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{network_name}} - Transmit", - "range": true, - "refId": "B", - "step": 240 } ], - "title": "Network Traffic by Packets", + "title": "Sockstat Used", "type": "timeseries" }, { @@ -11979,7 +14064,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "packets out (-) / in (+)", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12008,42 +14093,26 @@ }, "links": [], "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" - }, - { - "color": "red", - "value": 80 } ] }, - "unit": "pps" + "unit": "kbytes" }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Trans.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 10, "w": 12, "x": 12, - "y": 14 + "y": 44 }, - "id": 143, + "id": 220, "links": [], "options": { "legend": { @@ -12068,10 +14137,11 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_idrop\"}", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_mem\"} * 4", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "{{network_name}} - Receive drop", + "legendFormat": "mem_bytes - TCP sockets in that state", "range": true, "refId": "A", "step": 240 @@ -12079,16 +14149,27 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_odrop\"}", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"udp_mem\"} * 4", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "{{network_name}} - Transmit drop", + "legendFormat": "mem_bytes - UDP sockets in that state", "range": true, "refId": "B", "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_sock_stat{instance=\"$node\",value=\"frag_mem\"} * 4", + "interval": "", + "intervalFactor": 1, + "legendFormat": "FRAG_memory - Used memory for frag", + "range": true, + "refId": "C" } ], - "title": "Network Traffic Drop", + "title": "Sockstat Memory Size", "type": "timeseries" }, { @@ -12101,7 +14182,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "packets out (-) / in (+)", + "axisLabel": "octects out (-) / in (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12142,13 +14223,13 @@ } ] }, - "unit": "pps" + "unit": "short" }, "overrides": [ { "matcher": { "id": "byRegexp", - "options": "/.*Trans.*/" + "options": "/.*Out.*/" }, "properties": [ { @@ -12163,9 +14244,9 @@ "h": 10, "w": 12, "x": 0, - "y": 24 + "y": 54 }, - "id": 146, + "id": 221, "links": [], "options": { "legend": { @@ -12190,16 +14271,28 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_imulticast\"}", + "expr": "irate(sysom_net_ip_count{instance=\"$node\",value=\"InReceives\"}[$__rate_interval])", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "{{network_name}} - Receive multicast", + "legendFormat": "InOctets - Received octets", "range": true, "refId": "A", "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "irate(sysom_net_ip_count{instance=\"$node\",value=\"OutRequests\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "OutOctets - Sent octets", + "range": true, + "refId": "B", + "step": 240 } ], - "title": "Network Traffic Multicast", + "title": "Netstat IP In / Out Octets", "type": "timeseries" }, { @@ -12212,7 +14305,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "packets out (-) / in (+)", + "axisLabel": "datagrams", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12241,6 +14334,7 @@ }, "links": [], "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -12253,30 +14347,17 @@ } ] }, - "unit": "pps" + "unit": "short" }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Trans.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 10, "w": 12, "x": 12, - "y": 24 + "y": 54 }, - "id": 142, + "id": 81, "links": [], "options": { "legend": { @@ -12301,27 +14382,17 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"if_ierrs\"}", + "expr": "rate(sysom_net_ip_count{instance=\"$node\",value=\"Forwarding\"}[$__rate_interval])", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "{{network_name}} - Receive errors", + "legendFormat": "Forwarding - IP forwarding", "range": true, "refId": "A", "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_proc_networks{instance=\"$node\",counter=\"of_ierrs\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{network_name}} - Rransmit errors", - "range": true, - "refId": "B", - "step": 240 } ], - "title": "Network Traffic Errors", + "title": "Netstat IP Forwarding", "type": "timeseries" }, { @@ -12334,7 +14405,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "counter", + "axisLabel": "messages out (-) / in (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12363,7 +14434,6 @@ }, "links": [], "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -12378,15 +14448,28 @@ }, "unit": "short" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] }, "gridPos": { "h": 10, "w": 12, "x": 0, - "y": 34 + "y": 64 }, - "id": 63, + "id": 115, "links": [], "options": { "legend": { @@ -12398,8 +14481,7 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 300 + "showLegend": true }, "tooltip": { "mode": "multi", @@ -12411,11 +14493,11 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_alloc\"}", + "expr": "irate(sysom_net_icmp_count{instance=\"$node\",value=\"InMsgs\"}[$__rate_interval])", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "TCP_alloc - Allocated sockets", + "legendFormat": "InMsgs - Messages which the entity received. Note that this counter includes all those counted by icmpInErrors", "range": true, "refId": "A", "step": 240 @@ -12423,54 +14505,17 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_inuse\"}", + "expr": "irate(sysom_net_icmp_count{instance=\"$node\",value=\"OutMsgs\"}[$__rate_interval])", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "TCP_inuse - Tcp sockets currently in use", + "legendFormat": "OutMsgs - Messages which this entity attempted to send. Note that this counter includes all those counted by icmpOutErrors", "range": true, "refId": "B", "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_mem\"}", - "format": "time_series", - "hide": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "TCP_mem - Used memory for tcp", - "range": true, - "refId": "C", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_orphan\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "TCP_orphan - Orphan sockets", - "range": true, - "refId": "D", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_tw\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "TCP_tw - Sockets wating close", - "range": true, - "refId": "E", - "step": 240 } ], - "title": "Sockstat TCP", + "title": "ICMP In / Out", "type": "timeseries" }, { @@ -12483,7 +14528,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "counter", + "axisLabel": "messages out (-) / in (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12512,7 +14557,6 @@ }, "links": [], "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -12527,74 +14571,62 @@ }, "unit": "short" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] }, "gridPos": { "h": 10, "w": 12, "x": 12, - "y": 34 + "y": 64 }, - "id": 124, + "id": 50, "links": [], "options": { "legend": { "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 300 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "9.2.0", - "targets": [ - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"udplite_inuse\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "UDPLITE_inuse - Udplite sockets currently in use", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"udp_inuse\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "UDP_inuse - Udp sockets currently in use", - "range": true, - "refId": "B", - "step": 240 + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.2.0", + "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"udp_mem\"}", + "expr": "irate(sysom_net_icmp_count{instance=\"$node\",value=\"InErrors\"}[$__rate_interval])", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "UDP_mem - Used memory for udp", + "legendFormat": "InErrors - Messages which the entity received but determined as having ICMP-specific errors (bad ICMP checksums, bad length, etc.)", "range": true, - "refId": "C", + "refId": "A", "step": 240 } ], - "title": "Sockstat UDP", + "title": "ICMP Errors", "type": "timeseries" }, { @@ -12607,7 +14639,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "sockets", + "axisLabel": "datagrams out (-) / in (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12636,7 +14668,6 @@ }, "links": [], "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -12651,15 +14682,40 @@ }, "unit": "short" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Snd.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] }, "gridPos": { "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 74 }, - "id": 126, + "id": 55, "links": [], "options": { "legend": { @@ -12671,8 +14727,7 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 300 + "showLegend": true }, "tooltip": { "mode": "multi", @@ -12684,17 +14739,29 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"sockets_used\"}", + "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"InDatagrams\"}[$__rate_interval])", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "Sockets_used - Sockets currently in use", + "legendFormat": "InDatagrams - Datagrams received", "range": true, "refId": "A", "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"OutDatagrams\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "OutDatagrams - Datagrams sent", + "range": true, + "refId": "B", + "step": 240 } ], - "title": "Sockstat Used", + "title": "UDP In / Out", "type": "timeseries" }, { @@ -12707,7 +14774,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "", + "axisLabel": "datagrams", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12736,16 +14803,19 @@ }, "links": [], "mappings": [], - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "kbytes" + "unit": "short" }, "overrides": [] }, @@ -12753,9 +14823,9 @@ "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 74 }, - "id": 220, + "id": 109, "links": [], "options": { "legend": { @@ -12767,8 +14837,7 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 300 + "showLegend": true }, "tooltip": { "mode": "multi", @@ -12780,11 +14849,11 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"tcp_mem\"} * 4", + "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"InErrors\"}[$__rate_interval])", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "mem_bytes - TCP sockets in that state", + "legendFormat": "InErrors - UDP Datagrams that could not be delivered to an application", "range": true, "refId": "A", "step": 240 @@ -12792,11 +14861,11 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"udp_mem\"} * 4", + "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"NoPorts\"}[$__rate_interval])", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "mem_bytes - UDP sockets in that state", + "legendFormat": "NoPorts - UDP Datagrams received on a port with no listener", "range": true, "refId": "B", "step": 240 @@ -12804,15 +14873,29 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_sock_stat{instance=\"$node\",value=\"frag_mem\"} * 4", + "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"RcvbufErrors\"}[$__rate_interval])", + "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "FRAG_memory - Used memory for frag", + "legendFormat": "RcvbufErrors - UDP buffer errors received", "range": true, - "refId": "C" + "refId": "D", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"SndbufErrors\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "SndbufErrors - UDP buffer errors send", + "range": true, + "refId": "E", + "step": 240 } ], - "title": "Sockstat Memory Size", + "title": "UDP Errors", "type": "timeseries" }, { @@ -12825,7 +14908,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "octects out (-) / in (+)", + "axisLabel": "datagrams out (-) / in (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12880,6 +14963,18 @@ "value": "negative-Y" } ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Snd.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] } ] }, @@ -12887,9 +14982,9 @@ "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 84 }, - "id": 221, + "id": 299, "links": [], "options": { "legend": { @@ -12901,8 +14996,7 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 300 + "showLegend": true }, "tooltip": { "mode": "multi", @@ -12914,32 +15008,34 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_ip_count{instance=\"$node\",value=\"InReceives\"}[$__rate_interval])", + "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"InSegs\"}[$__rate_interval])", "format": "time_series", + "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "InOctets - Received octets", - "range": true, + "legendFormat": "InSegs - Segments received, including those received in error. This count includes segments received on currently established connections", "refId": "A", "step": 240 }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_ip_count{instance=\"$node\",value=\"OutRequests\"}[$__rate_interval])", + "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"OutSegs\"}[$__rate_interval])", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "OutOctets - Sent octets", + "legendFormat": "OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets", "range": true, "refId": "B", "step": 240 } ], - "title": "Netstat IP In / Out Octets", + "title": "TCP In / Out", "type": "timeseries" }, { "datasource": "sysom-prometheus", + "description": "", "fieldConfig": { "defaults": { "color": { @@ -12948,7 +15044,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "datagrams", + "axisLabel": "counter", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -12998,9 +15094,9 @@ "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 84 }, - "id": 81, + "id": 104, "links": [], "options": { "legend": { @@ -13012,8 +15108,7 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true, - "width": 300 + "showLegend": true }, "tooltip": { "mode": "multi", @@ -13025,17 +15120,61 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "rate(sysom_net_ip_count{instance=\"$node\",value=\"Forwarding\"}[$__rate_interval])", + "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"ListenOverflows\"}[$__rate_interval])", "format": "time_series", + "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "Forwarding - IP forwarding", + "legendFormat": "ListenOverflows - Times the listen queue of a socket overflowed", "range": true, "refId": "A", "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"ListenDrops\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "ListenDrops - SYNs to LISTEN sockets ignored", + "range": true, + "refId": "B", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "irate(sysom_net_retrans_count{instance=\"$node\",value=\"syn_ack\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCPSynRetrans - SYN-SYN/ACK retransmits to break down retransmissions in SYN, fast/timeout retransmits", + "range": true, + "refId": "C", + "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"RetransSegs\"}[$__rate_interval])", + "interval": "", + "legendFormat": "RetransSegs - Segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets", + "range": true, + "refId": "D" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"InErrs\"}[$__rate_interval])", + "interval": "", + "legendFormat": "InErrs - Segments received in error (e.g., bad TCP checksums)", + "range": true, + "refId": "E" } ], - "title": "Netstat IP Forwarding", + "title": "TCP Errors", "type": "timeseries" }, { @@ -13048,7 +15187,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "messages out (-) / in (+)", + "axisLabel": "connections", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -13077,6 +15216,7 @@ }, "links": [], "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -13095,12 +15235,19 @@ { "matcher": { "id": "byRegexp", - "options": "/.*Out.*/" + "options": "/.*MaxConn *./" }, "properties": [ { - "id": "custom.transform", - "value": "negative-Y" + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 } ] } @@ -13110,9 +15257,9 @@ "h": 10, "w": 12, "x": 0, - "y": 64 + "y": 94 }, - "id": 115, + "id": 85, "links": [], "options": { "legend": { @@ -13136,33 +15283,23 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_icmp_count{instance=\"$node\",value=\"InMsgs\"}[$__rate_interval])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "InMsgs - Messages which the entity received. Note that this counter includes all those counted by icmpInErrors", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "irate(sysom_net_icmp_count{instance=\"$node\",value=\"OutMsgs\"}[$__rate_interval])", + "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"CurrEstab\"}[$__rate_interval])", "format": "time_series", + "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "OutMsgs - Messages which this entity attempted to send. Note that this counter includes all those counted by icmpOutErrors", + "legendFormat": "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT", "range": true, - "refId": "B", + "refId": "A", "step": 240 } ], - "title": "ICMP In / Out", + "title": "TCP Connections", "type": "timeseries" }, { "datasource": "sysom-prometheus", + "description": "", "fieldConfig": { "defaults": { "color": { @@ -13171,7 +15308,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "messages out (-) / in (+)", + "axisLabel": "counter", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -13200,6 +15337,7 @@ }, "links": [], "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -13214,28 +15352,15 @@ }, "unit": "short" }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Out.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 10, "w": 12, "x": 12, - "y": 64 + "y": 94 }, - "id": 50, + "id": 395, "links": [], "options": { "legend": { @@ -13259,21 +15384,23 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_icmp_count{instance=\"$node\",value=\"InErrors\"}[$__rate_interval])", + "expr": "sysom_net_retrans_count{instance=\"$node\"}", "format": "time_series", + "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "InErrors - Messages which the entity received but determined as having ICMP-specific errors (bad ICMP checksums, bad length, etc.)", + "legendFormat": "{{value}}", "range": true, "refId": "A", "step": 240 } ], - "title": "ICMP Errors", + "title": "TCP Retrans", "type": "timeseries" }, { "datasource": "sysom-prometheus", + "description": "", "fieldConfig": { "defaults": { "color": { @@ -13282,7 +15409,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "datagrams out (-) / in (+)", + "axisLabel": "counter out (-) / in (+)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -13329,19 +15456,7 @@ { "matcher": { "id": "byRegexp", - "options": "/.*Out.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*Snd.*/" + "options": "/.*Sent.*/" }, "properties": [ { @@ -13356,9 +15471,9 @@ "h": 10, "w": 12, "x": 0, - "y": 74 + "y": 104 }, - "id": 55, + "id": 91, "links": [], "options": { "legend": { @@ -13382,11 +15497,12 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"InDatagrams\"}[$__rate_interval])", + "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"SyncookiesFailed\"}[$__rate_interval])", "format": "time_series", + "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "InDatagrams - Datagrams received", + "legendFormat": "SyncookiesFailed - Invalid SYN cookies received", "range": true, "refId": "A", "step": 240 @@ -13394,19 +15510,53 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"OutDatagrams\"}[$__rate_interval])", + "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"SyncookiesRecv\"}[$__rate_interval])", "format": "time_series", + "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "OutDatagrams - Datagrams sent", + "legendFormat": "SyncookiesRecv - SYN cookies received", "range": true, "refId": "B", "step": 240 + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"SyncookiesSent\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "SyncookiesSent - SYN cookies sent", + "range": true, + "refId": "C", + "step": 240 } ], - "title": "UDP In / Out", + "title": "TCP SynCookie", "type": "timeseries" - }, + } + ], + "targets": [ + { + "datasource": "sysom-prometheus", + "refId": "A" + } + ], + "title": "Network Traffic and Sockstat and Netstat", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 427, + "panels": [ { "datasource": "sysom-prometheus", "fieldConfig": { @@ -13417,11 +15567,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "datagrams", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 20, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, @@ -13434,7 +15584,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -13444,7 +15594,6 @@ "mode": "off" } }, - "links": [], "mappings": [], "thresholds": { "mode": "absolute", @@ -13457,88 +15606,93 @@ "value": 80 } ] - }, - "unit": "short" + } }, - "overrides": [] + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "node-real", + "node-predict" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] }, "gridPos": { - "h": 10, + "h": 8, "w": 12, - "x": 12, - "y": 74 + "x": 0, + "y": 7 }, - "id": 109, - "links": [], + "id": 435, "options": { "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", + "calcs": [], + "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { - "mode": "multi", + "mode": "single", "sort": "none" } }, - "pluginVersion": "9.2.0", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"InErrors\"}[$__rate_interval])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "InErrors - UDP Datagrams that could not be delivered to an application", + "expr": "sysom_colocation_node_predict_rt{category=\"predict\",tag=\"ALL\",resource=\"CPU\",exported_instance=\"$node\"}", + "legendFormat": "node-{{category}}", "range": true, - "refId": "A", - "step": 240 + "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"NoPorts\"}[$__rate_interval])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "NoPorts - UDP Datagrams received on a port with no listener", + "expr": "100-sysom_proc_cpu_total{mode=\"idle\",instance=\"$node\"}", + "hide": false, + "legendFormat": "node-real", "range": true, - "refId": "B", - "step": 240 + "refId": "B" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"RcvbufErrors\"}[$__rate_interval])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "RcvbufErrors - UDP buffer errors received", + "expr": "sum(sysom_container_cpuacct_stat{bvt=\"LS\",instance=\"$node\",value=\"total\"})", + "hide": false, + "legendFormat": "ls-real", "range": true, - "refId": "D", - "step": 240 + "refId": "C" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_udp_count{instance=\"$node\",value=\"SndbufErrors\"}[$__rate_interval])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "SndbufErrors - UDP buffer errors send", + "expr": "sysom_colocation_node_predict_rt{category=\"predict\",tag=\"LS\",resource=\"CPU\",exported_instance=\"$node\"}", + "hide": false, + "legendFormat": "ls-{{category}}", "range": true, - "refId": "E", - "step": 240 + "refId": "D" } ], - "title": "UDP Errors", + "title": "$node- CPU\u9884\u4f30\u8d44\u6e90VS\u5b9e\u9645\u8d44\u6e90", "type": "timeseries" }, { @@ -13551,11 +15705,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "datagrams out (-) / in (+)", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 20, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, @@ -13568,7 +15722,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -13578,7 +15732,6 @@ "mode": "off" } }, - "links": [], "mappings": [], "thresholds": { "mode": "absolute", @@ -13591,131 +15744,173 @@ "value": 80 } ] - }, - "unit": "short" + } }, "overrides": [ { + "__systemRef": "hideSeriesFrom", "matcher": { - "id": "byRegexp", - "options": "/.*Out.*/" + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "node-predict", + "node-real" + ], + "prefix": "All except:", + "readOnly": true + } }, "properties": [ { - "id": "custom.transform", - "value": "negative-Y" + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } } ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*Snd.*/" - }, - "properties": [ + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 431, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_colocation_node_predict_rt{category=\"predict\",tag=\"ALL\",resource=\"MEMORY\",exported_instance=\"$node\"}", + "legendFormat": "node-{{category}}", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "100 - sysom_proc_meminfo{value=\"MemAvailable\",instance=\"$node\"} / on(instance) sysom_proc_meminfo{value=\"MemTotal\",instance=\"$node\"} * 100", + "hide": false, + "legendFormat": "node-real", + "range": true, + "refId": "B" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "(100*sum(sysom_container_memUtil{bvt=\"LS\",value=\"usage\",instance=\"$node\"})/1024)/sum(sysom_proc_meminfo{value=\"MemTotal\",instance=\"$node\"})", + "hide": false, + "legendFormat": "ls-real", + "range": true, + "refId": "C" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_colocation_node_predict_rt{category=\"predict\",tag=\"LS\",resource=\"MEMORY\",exported_instance=\"$node\"}", + "hide": false, + "legendFormat": "ls-{{category}}", + "range": true, + "refId": "D" + } + ], + "title": "$node - MEM\u9884\u4f30\u8d44\u6e90VS\u5b9e\u9645\u8d44\u6e90", + "type": "timeseries" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "custom.transform", - "value": "negative-Y" + "color": "green" + }, + { + "color": "red", + "value": 80 } ] } - ] + }, + "overrides": [] }, "gridPos": { - "h": 10, + "h": 8, "w": 12, "x": 0, - "y": 84 + "y": 15 }, - "id": 299, - "links": [], + "id": 436, "options": { - "legend": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "auto", + "reduceOptions": { "calcs": [ - "mean", - "lastNotNull", - "max", - "min" + "lastNotNull" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "showUnfilled": true }, - "pluginVersion": "9.2.0", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"InSegs\"}[$__rate_interval])", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "InSegs - Segments received, including those received in error. This count includes segments received on currently established connections", - "refId": "A", - "step": 240 + "expr": "sysom_colocation_node_predict_future{resource=\"CPU\", tag=\"LS\", category=\"predict\",exported_instance=\"$node\", future=~\".\"}", + "legendFormat": "{{future}}", + "range": true, + "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"OutSegs\"}[$__rate_interval])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets", + "expr": "sysom_colocation_node_predict_future{resource=\"CPU\", tag=\"LS\", category=\"predict\",exported_instance=\"$node\", future=~\"..\"}", + "hide": false, + "legendFormat": "{{future}}", "range": true, - "refId": "B", - "step": 240 + "refId": "B" } ], - "title": "TCP In / Out", - "type": "timeseries" + "title": "$node - LS\u670d\u52a1CPU\u8d44\u6e90\u9884\u4f30(\u672a\u676524\u5c0f\u65f6)", + "type": "bargauge" }, { "datasource": "sysom-prometheus", - "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "counter", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "mode": "thresholds" }, - "links": [], "mappings": [], + "max": 100, "min": 0, "thresholds": { "mode": "absolute", @@ -13728,137 +15923,63 @@ "value": 80 } ] - }, - "unit": "short" + } }, "overrides": [] }, "gridPos": { - "h": 10, + "h": 8, "w": 12, "x": 12, - "y": 84 + "y": 15 }, - "id": 104, - "links": [], + "id": 433, "options": { - "legend": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "auto", + "reduceOptions": { "calcs": [ - "mean", - "lastNotNull", - "max", - "min" + "lastNotNull" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "showUnfilled": true }, - "pluginVersion": "9.2.0", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"ListenOverflows\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "ListenOverflows - Times the listen queue of a socket overflowed", + "expr": "sysom_colocation_node_predict_future{resource=\"MEMORY\", tag=\"LS\", category=\"predict\",exported_instance=\"$node\", future=~\".\"}", + "legendFormat": "{{future}}", "range": true, - "refId": "A", - "step": 240 + "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"ListenDrops\"}[$__rate_interval])", - "format": "time_series", + "expr": "sysom_colocation_node_predict_future{resource=\"MEMORY\", tag=\"LS\", category=\"predict\",exported_instance=\"$node\", future=~\"..\"}", "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "ListenDrops - SYNs to LISTEN sockets ignored", - "range": true, - "refId": "B", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "irate(sysom_net_retrans_count{instance=\"$node\",value=\"syn_ack\"}[$__rate_interval])", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "TCPSynRetrans - SYN-SYN/ACK retransmits to break down retransmissions in SYN, fast/timeout retransmits", - "range": true, - "refId": "C", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"RetransSegs\"}[$__rate_interval])", - "interval": "", - "legendFormat": "RetransSegs - Segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets", - "range": true, - "refId": "D" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"InErrs\"}[$__rate_interval])", - "interval": "", - "legendFormat": "InErrs - Segments received in error (e.g., bad TCP checksums)", + "legendFormat": "{{future}}", "range": true, - "refId": "E" + "refId": "B" } ], - "title": "TCP Errors", - "type": "timeseries" + "title": "$node - LS\u670d\u52a1MEM\u8d44\u6e90\u9884\u4f30(\u672a\u676524\u5c0f\u65f6)", + "type": "bargauge" }, { "datasource": "sysom-prometheus", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "connections", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "mode": "thresholds" }, - "links": [], "mappings": [], + "max": 100, "min": 0, "thresholds": { "mode": "absolute", @@ -13871,115 +15992,133 @@ "value": 80 } ] - }, - "unit": "short" + } }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*MaxConn *./" - }, - "properties": [ + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 437, + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "pluginVersion": "9.2.2", + "targets": [ + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_colocation_node_predict_future{resource=\"CPU\", tag=\"ALL\",exported_instance=\"$node\", future=~\".\"}", + "legendFormat": "{{future}}", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_colocation_node_predict_future{resource=\"CPU\", tag=\"ALL\",exported_instance=\"$node\", future=~\"..\"}", + "hide": false, + "legendFormat": "{{future}}", + "range": true, + "refId": "B" + } + ], + "title": "$node - \u8282\u70b9CPU\u8d44\u6e90\u9884\u4f30(\u672a\u676524\u5c0f\u65f6)", + "type": "bargauge" + }, + { + "datasource": "sysom-prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } + "color": "green" }, { - "id": "custom.fillOpacity", - "value": 0 + "color": "red", + "value": 80 } ] } - ] + }, + "overrides": [] }, "gridPos": { - "h": 10, + "h": 8, "w": 12, - "x": 0, - "y": 94 + "x": 12, + "y": 23 }, - "id": 85, - "links": [], + "id": 434, "options": { - "legend": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": -1, + "orientation": "vertical", + "reduceOptions": { "calcs": [ - "mean", - "lastNotNull", - "max", - "min" + "lastNotNull" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "showUnfilled": true }, - "pluginVersion": "9.2.0", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_tcp_count{instance=\"$node\",value=\"CurrEstab\"}[$__rate_interval])", - "format": "time_series", + "expr": "sysom_colocation_node_predict_future{resource=\"MEMORY\", tag=\"ALL\",exported_instance=\"$node\", future=~\".\"}", + "legendFormat": "{{future}}", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_colocation_node_predict_future{resource=\"MEMORY\", tag=\"ALL\",exported_instance=\"$node\",future=~\"..\"}", "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT", + "legendFormat": "{{future}}", "range": true, - "refId": "A", - "step": 240 + "refId": "B" } ], - "title": "TCP Connections", - "type": "timeseries" + "title": "$node - \u8282\u70b9MEM\u8d44\u6e90\u9884\u4f30(\u672a\u676524\u5c0f\u65f6)", + "type": "bargauge" }, { "datasource": "sysom-prometheus", - "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "counter", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "mode": "thresholds" }, - "links": [], "mappings": [], + "max": 100, "min": 0, "thresholds": { "mode": "absolute", @@ -13992,95 +16131,64 @@ "value": 80 } ] - }, - "unit": "short" + } }, "overrides": [] }, "gridPos": { - "h": 10, + "h": 8, "w": 12, - "x": 12, - "y": 94 + "x": 0, + "y": 31 }, - "id": 395, - "links": [], + "id": 439, "options": { - "legend": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "auto", + "reduceOptions": { "calcs": [ - "mean", - "lastNotNull", - "max", - "min" + "lastNotNull" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "showUnfilled": true }, - "pluginVersion": "9.2.0", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_net_retrans_count{instance=\"$node\"}", - "format": "time_series", + "expr": "sysom_colocation_node_predict_future{resource=\"CPU\", tag=\"LS\", category=\"slack\",exported_instance=\"$node\", future=~\".\"}", + "legendFormat": "{{future}}", + "range": true, + "refId": "A" + }, + { + "datasource": "sysom-prometheus", + "editorMode": "code", + "expr": "sysom_colocation_node_predict_future{resource=\"CPU\", tag=\"LS\", category=\"slack\",exported_instance=\"$node\", future=~\"..\"}", "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{value}}", + "legendFormat": "{{future}}", "range": true, - "refId": "A", - "step": 240 + "refId": "B" } ], - "title": "TCP Retrans", - "type": "timeseries" + "title": "$node - LS\u670d\u52a1CPU-slack\u8d44\u6e90\u9884\u4f30", + "type": "bargauge" }, { "datasource": "sysom-prometheus", - "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "counter out (-) / in (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "mode": "thresholds" }, - "links": [], "mappings": [], + "max": 100, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -14092,102 +16200,56 @@ "value": 80 } ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Sent.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] } - ] + }, + "overrides": [] }, "gridPos": { - "h": 10, + "h": 8, "w": 12, - "x": 0, - "y": 104 + "x": 12, + "y": 31 }, - "id": 91, - "links": [], + "id": 438, "options": { - "legend": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "auto", + "reduceOptions": { "calcs": [ - "mean", - "lastNotNull", - "max", - "min" + "lastNotNull" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "fields": "", + "values": false }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "showUnfilled": true }, - "pluginVersion": "9.2.0", + "pluginVersion": "9.2.2", "targets": [ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"SyncookiesFailed\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "SyncookiesFailed - Invalid SYN cookies received", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"SyncookiesRecv\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "SyncookiesRecv - SYN cookies received", + "expr": "sysom_colocation_node_predict_future{resource=\"MEMORY\", tag=\"LS\", category=\"slack\",exported_instance=\"$node\", future=~\".\"}", + "legendFormat": "{{future}}", "range": true, - "refId": "B", - "step": 240 + "refId": "A" }, { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "irate(sysom_net_tcp_ext_count{instance=\"$node\",value=\"SyncookiesSent\"}[$__rate_interval])", - "format": "time_series", + "expr": "sysom_colocation_node_predict_future{resource=\"MEMORY\", tag=\"LS\", category=\"slack\",exported_instance=\"$node\", future=~\"..\"}", "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "SyncookiesSent - SYN cookies sent", + "legendFormat": "{{future}}", "range": true, - "refId": "C", - "step": 240 + "refId": "B" } ], - "title": "TCP SynCookie", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": "sysom-prometheus", - "refId": "A" + "title": "$node - LS\u670d\u52a1MEMORY-slack\u8d44\u6e90\u9884\u4f30", + "type": "bargauge" } ], - "title": "Network Traffic and Sockstat and Netstat", + "title": "Resource Evaluation", "type": "row" } ], @@ -14221,8 +16283,8 @@ { "current": { "selected": false, - "text": "127.0.0.1:8400", - "value": "127.0.0.1:8400" + "text": "192.168.10.149:8400", + "value": "192.168.10.149:8400" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_proc_meminfo, instance)", @@ -14248,7 +16310,7 @@ ] }, "time": { - "from": "now-1h", + "from": "now-24h", "to": "now" }, "timepicker": { @@ -14279,7 +16341,7 @@ "timezone": "browser", "title": "sysom_base", "uid": "rYdddlPWk", - "version": 4, + "version": 5, "weekStart": "" } } \ No newline at end of file diff --git a/docker/sysom-init.service b/docker/sysom-init.service index 96c390f5e5f58d9718acc66c6847c0ea3bb423e1..4abdbc4e1061aacf1d5f8f0d59a1543cebdf0c37 100644 --- a/docker/sysom-init.service +++ b/docker/sysom-init.service @@ -1,10 +1,12 @@ [Unit] Description=Init sysom After=mariadb.service +Before=supervisord [Service] Type=oneshot -ExecStart=bash -x /root/sysom/script/server/init.sh +WorkingDirectory=/usr/local/sysom/init_scripts +ExecStart=bash -x ./sysom.sh init ALL [Install] WantedBy=multi-user.target \ No newline at end of file diff --git a/docker/sysom_dockerfile b/docker/sysom_base_dockerfile similarity index 75% rename from docker/sysom_dockerfile rename to docker/sysom_base_dockerfile index be9d2a0fbcbd56b1e65e25188c8435e5621efb52..b75442ff694559e936c4c8b4cf9f37a3cd16d801 100644 --- a/docker/sysom_dockerfile +++ b/docker/sysom_base_dockerfile @@ -1,4 +1,14 @@ -FROM openanolis/anolisos:8.8 + +FROM node:16.20.1 as web_builder +COPY sysom_web /root/sysom_web +WORKDIR /root/sysom_web +RUN npm config set registry https://registry.npmmirror.com +RUN yarn config set registry https://registry.npmmirror.com +RUN yarn +RUN yarn build + + +FROM openanolis/anolisos:8.8 as prod # Add epel RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm @@ -28,11 +38,11 @@ COPY deps /root/sysom/deps COPY environment /root/sysom/environment COPY sysom_server /root/sysom/sysom_server -COPY sysom_web/dist /usr/local/sysom/web +COPY --from=web_builder /root/sysom_web/dist /usr/local/sysom/web RUN bash -x /root/sysom/script/sysom.sh install deps ALL RUN bash -x /root/sysom/script/sysom.sh install env ALL -RUN bash -x /root/sysom/script/sysom.sh install ms sysom_api,sysom_diagnosis,sysom_channel,sysom_monitor_server,sysom_migration +RUN bash -x /root/sysom/script/sysom.sh install ms sysom_api,sysom_diagnosis,sysom_channel,sysom_monitor_server,sysom_log,sysom_alarm,sysom_cmg RUN yum clean all diff --git a/docker/sysom_api_dockerfile b/docker/sysom_base_lite_dockerfile similarity index 38% rename from docker/sysom_api_dockerfile rename to docker/sysom_base_lite_dockerfile index 4eaa0b9c240b05c5fcb8b06743f12dcd78c23136..df9a713ca1ca91a89df9d7e4a45960319b56f1b1 100644 --- a/docker/sysom_api_dockerfile +++ b/docker/sysom_base_lite_dockerfile @@ -1,4 +1,14 @@ -FROM openanolis/anolisos:8.8 + +FROM node:16.20.1 as web_builder +COPY sysom_web /root/sysom_web +WORKDIR /root/sysom_web +RUN npm config set registry https://registry.npmmirror.com +RUN yarn config set registry https://registry.npmmirror.com +RUN yarn +RUN yarn build + + +FROM openanolis/anolisos:8.8 as prod # Add epel RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm @@ -9,23 +19,34 @@ RUN bash -c "sed -i 's|^metalink|#metalink|' /etc/yum.repos.d/epel*" RUN yum makecache RUN yum install -y supervisor cronie net-tools RUN systemctl enable crond -# RUN systemctl enable supervisord +RUN yum install -y python3 +RUN yum install -y nginx +RUN yum install -y wget +RUN systemctl enable supervisord +RUN systemctl enable nginx # Init sysom-diagnosis ARG SYSOM_HOME=/usr/local/sysom ARG SYSOM_SERVER_HOME=${SYSOM_HOME}/server RUN mkdir /root/sysom +RUN mkdir -p /usr/local/sysom COPY conf /root/sysom/conf COPY script /root/sysom/script -COPY infrastructure /root/sysom/infrastructure -COPY microservice /root/sysom/microservice +COPY deps /root/sysom/deps +COPY environment /root/sysom/environment +COPY sysom_server /root/sysom/sysom_server -RUN bash -x /root/sysom/script/sysom.sh deploy infrastructure env,sdk -RUN bash -x /root/sysom/script/sysom.sh deploy microservice sysom_api -RUN sed "s/nodaemon=false/nodaemon=true/g" -i /etc/supervisord.conf +COPY --from=web_builder /root/sysom_web/dist /usr/local/sysom/web + +RUN bash -x /root/sysom/script/sysom.sh install deps nginx +RUN bash -x /root/sysom/script/sysom.sh install env ALL +RUN bash -x /root/sysom/script/sysom.sh install ms sysom_api,sysom_diagnosis,sysom_channel,sysom_monitor_server,sysom_log,sysom_cmg RUN yum clean all +COPY docker/sysom-init.service /usr/lib/systemd/system/sysom-init.service +RUN systemctl enable sysom-init.service + # # 环境准备 -ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file +ENTRYPOINT [ "/usr/sbin/init" ] \ No newline at end of file diff --git a/docker/sysom_channel_dockerfile b/docker/sysom_channel_dockerfile deleted file mode 100644 index 70e34b9f6e30fa5bf2e3bb75a7475abf3dbfc8ac..0000000000000000000000000000000000000000 --- a/docker/sysom_channel_dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM openanolis/anolisos:8.8 - -# Add epel -RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm -RUN bash -c "sed -i 's|^#baseurl=https://download.example/pub|baseurl=https://mirrors.aliyun.com|' /etc/yum.repos.d/epel*" -RUN bash -c "sed -i 's|^metalink|#metalink|' /etc/yum.repos.d/epel*" - -# Add required yum packages -RUN yum makecache -RUN yum install -y supervisor cronie net-tools -RUN systemctl enable crond -# RUN systemctl enable supervisord - -# Init sysom-diagnosis -ARG SYSOM_HOME=/usr/local/sysom -ARG SYSOM_SERVER_HOME=${SYSOM_HOME}/server - -RUN mkdir /root/sysom -COPY conf /root/sysom/conf -COPY script /root/sysom/script -COPY infrastructure /root/sysom/infrastructure -COPY microservice /root/sysom/microservice - -RUN bash -x /root/sysom/script/sysom.sh deploy infrastructure env,sdk -RUN bash -x /root/sysom/script/sysom.sh deploy microservice sysom_channel -RUN sed "s/nodaemon=false/nodaemon=true/g" -i /etc/supervisord.conf - -RUN yum clean all - -# # 环境准备 -ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/docker/sysom_diagnosis_dockerfile b/docker/sysom_diagnosis_dockerfile deleted file mode 100644 index 5197206457001b4e730ed5f90fff5d6dff539be5..0000000000000000000000000000000000000000 --- a/docker/sysom_diagnosis_dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM openanolis/anolisos:8.8 - -# Add epel -RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm -RUN bash -c "sed -i 's|^#baseurl=https://download.example/pub|baseurl=https://mirrors.aliyun.com|' /etc/yum.repos.d/epel*" -RUN bash -c "sed -i 's|^metalink|#metalink|' /etc/yum.repos.d/epel*" - -# Add required yum packages -RUN yum makecache -RUN yum install -y supervisor cronie net-tools -RUN systemctl enable crond -# RUN systemctl enable supervisord - -# Init sysom-diagnosis -ARG SYSOM_HOME=/usr/local/sysom -ARG SYSOM_SERVER_HOME=${SYSOM_HOME}/server - -RUN mkdir /root/sysom -COPY conf /root/sysom/conf -COPY script /root/sysom/script -COPY infrastructure /root/sysom/infrastructure -COPY microservice /root/sysom/microservice - -RUN bash -x /root/sysom/script/sysom.sh deploy infrastructure env,sdk -RUN bash -x /root/sysom/script/sysom.sh deploy microservice sysom_diagnosis -RUN sed "s/nodaemon=false/nodaemon=true/g" -i /etc/supervisord.conf - -RUN yum clean all - -# # 环境准备 -ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/docker/sysom_hotfix_dockerfile b/docker/sysom_hotfix_dockerfile deleted file mode 100644 index ffc3106a6647e25b346c46623d17a413c4c8b134..0000000000000000000000000000000000000000 --- a/docker/sysom_hotfix_dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM openanolis/anolisos:8.8 - -# Add epel -RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm -RUN bash -c "sed -i 's|^#baseurl=https://download.example/pub|baseurl=https://mirrors.aliyun.com|' /etc/yum.repos.d/epel*" -RUN bash -c "sed -i 's|^metalink|#metalink|' /etc/yum.repos.d/epel*" - -# Add required yum packages -RUN yum makecache -RUN yum install -y supervisor cronie net-tools rpcbind nfs-utils -RUN systemctl enable crond -# RUN systemctl enable supervisord - -# Init sysom-diagnosis -ARG SYSOM_HOME=/usr/local/sysom -ARG SYSOM_SERVER_HOME=${SYSOM_HOME}/server - -RUN mkdir /root/sysom -COPY conf /root/sysom/conf -COPY script /root/sysom/script -COPY infrastructure /root/sysom/infrastructure -COPY microservice /root/sysom/microservice - -RUN bash -x /root/sysom/script/sysom.sh deploy infrastructure env,sdk -RUN bash -x /root/sysom/script/sysom.sh deploy microservice sysom_hotfix -RUN sed "s/nodaemon=false/nodaemon=true/g" -i /etc/supervisord.conf - -RUN yum clean all - -# # 环境准备 -ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/docker/sysom_migration_dockerfile b/docker/sysom_migration_dockerfile deleted file mode 100644 index e34ec3d20ecd72894efb4d52b7b52ff39cdc887d..0000000000000000000000000000000000000000 --- a/docker/sysom_migration_dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM openanolis/anolisos:8.8 - -# Add epel -RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm -RUN bash -c "sed -i 's|^#baseurl=https://download.example/pub|baseurl=https://mirrors.aliyun.com|' /etc/yum.repos.d/epel*" -RUN bash -c "sed -i 's|^metalink|#metalink|' /etc/yum.repos.d/epel*" - -# Add required yum packages -RUN yum makecache -RUN yum install -y supervisor cronie net-tools wget -RUN systemctl enable crond -# RUN systemctl enable supervisord - -# Init sysom-diagnosis -ARG SYSOM_HOME=/usr/local/sysom -ARG SYSOM_SERVER_HOME=${SYSOM_HOME}/server - -RUN mkdir /root/sysom -COPY conf /root/sysom/conf -COPY script /root/sysom/script -COPY infrastructure /root/sysom/infrastructure -COPY microservice /root/sysom/microservice - -RUN bash -x /root/sysom/script/sysom.sh deploy infrastructure env,sdk -RUN bash -x /root/sysom/script/sysom.sh deploy microservice sysom_migration -RUN sed "s/nodaemon=false/nodaemon=true/g" -i /etc/supervisord.conf - -RUN yum clean all - -# # 环境准备 -ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/docker/sysom_monitor_server_dockerfile b/docker/sysom_monitor_server_dockerfile deleted file mode 100644 index a930cc609bee8f1b44ff22efcc68cce58a07efb2..0000000000000000000000000000000000000000 --- a/docker/sysom_monitor_server_dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM openanolis/anolisos:8.8 - -# Add epel -RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm -RUN bash -c "sed -i 's|^#baseurl=https://download.example/pub|baseurl=https://mirrors.aliyun.com|' /etc/yum.repos.d/epel*" -RUN bash -c "sed -i 's|^metalink|#metalink|' /etc/yum.repos.d/epel*" - -# Add required yum packages -RUN yum makecache -RUN yum install -y supervisor cronie net-tools -RUN systemctl enable crond -# RUN systemctl enable supervisord - -# Init sysom-diagnosis -ARG SYSOM_HOME=/usr/local/sysom -ARG SYSOM_SERVER_HOME=${SYSOM_HOME}/server - -RUN mkdir /root/sysom -COPY conf /root/sysom/conf -COPY script /root/sysom/script -COPY infrastructure /root/sysom/infrastructure -COPY microservice /root/sysom/microservice - -RUN bash -x /root/sysom/script/sysom.sh deploy infrastructure env,sdk -RUN bash -x /root/sysom/script/sysom.sh deploy microservice sysom_monitor_server -RUN sed "s/nodaemon=false/nodaemon=true/g" -i /etc/supervisord.conf - -RUN yum clean all - -# # 环境准备 -ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/docker/sysom_vmcore_dockerfile b/docker/sysom_vmcore_dockerfile deleted file mode 100644 index 6bbe95481eb22c124fe0db4f5ef678cf81250781..0000000000000000000000000000000000000000 --- a/docker/sysom_vmcore_dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM openanolis/anolisos:8.8 - -# Add epel -RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm -RUN bash -c "sed -i 's|^#baseurl=https://download.example/pub|baseurl=https://mirrors.aliyun.com|' /etc/yum.repos.d/epel*" -RUN bash -c "sed -i 's|^metalink|#metalink|' /etc/yum.repos.d/epel*" - -# Add required yum packages -RUN yum makecache -RUN yum install -y supervisor cronie net-tools rpcbind nfs-utils -RUN systemctl enable crond -# RUN systemctl enable supervisord - -# Init sysom-diagnosis -ARG SYSOM_HOME=/usr/local/sysom -ARG SYSOM_SERVER_HOME=${SYSOM_HOME}/server - -RUN mkdir /root/sysom -COPY conf /root/sysom/conf -COPY script /root/sysom/script -COPY infrastructure /root/sysom/infrastructure -COPY microservice /root/sysom/microservice - -RUN bash -x /root/sysom/script/sysom.sh deploy infrastructure env,sdk -RUN bash -x /root/sysom/script/sysom.sh deploy microservice sysom_vmcore -RUN sed "s/nodaemon=false/nodaemon=true/g" -i /etc/supervisord.conf - -RUN yum clean all - -# # 环境准备 -ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/docker/sysom_vul_dockerfile b/docker/sysom_vul_dockerfile deleted file mode 100644 index 378fb79bd707fb457e00312d79aa59ec49f27da5..0000000000000000000000000000000000000000 --- a/docker/sysom_vul_dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM openanolis/anolisos:8.8 - -# Add epel -RUN yum install -y https://mirrors.aliyun.com/epel/epel-release-latest-8.noarch.rpm -RUN bash -c "sed -i 's|^#baseurl=https://download.example/pub|baseurl=https://mirrors.aliyun.com|' /etc/yum.repos.d/epel*" -RUN bash -c "sed -i 's|^metalink|#metalink|' /etc/yum.repos.d/epel*" - -# Add required yum packages -RUN yum makecache -RUN yum install -y supervisor cronie net-tools rpcbind nfs-utils -RUN systemctl enable crond -# RUN systemctl enable supervisord - -# Init sysom-diagnosis -ARG SYSOM_HOME=/usr/local/sysom -ARG SYSOM_SERVER_HOME=${SYSOM_HOME}/server - -RUN mkdir /root/sysom -COPY conf /root/sysom/conf -COPY script /root/sysom/script -COPY infrastructure /root/sysom/infrastructure -COPY microservice /root/sysom/microservice - -RUN bash -x /root/sysom/script/sysom.sh deploy infrastructure env,sdk -RUN bash -x /root/sysom/script/sysom.sh deploy microservice sysom_vul -RUN sed "s/nodaemon=false/nodaemon=true/g" -i /etc/supervisord.conf - -RUN yum clean all - -# # 环境准备 -ENTRYPOINT ["/usr/bin/supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/docs/develop_guide.md b/docs/develop_guide.md index b40b6fe9cb92a2ce42491537281b988e1fd968d4..bd97740f6310a75e3e17ded6c43eb658f43dd88b 100644 --- a/docs/develop_guide.md +++ b/docs/develop_guide.md @@ -302,4 +302,28 @@ for msg in consumer: # 使用示例 => 下列命令会一键创建一个名为 sysom_demo 的微服务,并监听在 7010 端口 ./sysom.sh create server demo 7010 -``` \ No newline at end of file +``` + + +### 3.1 已有微服务以及端口占用情况 +| **微服务名** | **占用端口** | +| -------------- | ----------------- | +| sysom_api | 7001 | +| sysom_diagnosis | 7002 | +| sysom_channel | 7003 | +| sysom_vmcore | 7004 | +| sysom_vul | 7005 | +| sysom_migration | 7006 | +| sysom_hotfix | 7007 | +| sysom_monitor_server | 7009 | +| sysom_log | 7010 | +| sysom_cec_proxy | 7011 | +| sysom_alarm | 7012 | +| sysom_rca | 7013 | +| sysom_ad_proxy | 7014 | +| sysom_knowledge | 7015 | +| sysom_hotfix_builder | 7016 | +| sysom_metric_anomaly_detection | 7017 | +| sysom_alert_pusher | 7018 | +| sysom_dingtalk | 7019 | +| sysom_cluster_health | 7020 | \ No newline at end of file diff --git a/environment/0_env/requirements.txt b/environment/0_env/requirements.txt index fab44e461d26933677238948e6afec072b86338f..8f2b516c798cdc4c9423bf0b49db8e2f49b930f3 100644 --- a/environment/0_env/requirements.txt +++ b/environment/0_env/requirements.txt @@ -6,6 +6,7 @@ aiofiles==0.8.0 anyio==3.6.2 aiohttp==3.8.4 asyncer==0.0.2 +sqlalchemy==1.4.49 # asyncssh==2.12.0 # autopep8==2.0.0 # channels==3.0.4 @@ -37,4 +38,6 @@ uvicorn[standard]==0.16.0 # xlrd==2.0.1 # prometheus-client==0.16.0 pyyaml==6.0 -pyyaml-include==1.3 \ No newline at end of file +pyyaml-include==1.3 +psutil==5.9.7 +confluent-kafka==1.9.0 diff --git a/environment/1_sdk/cec_kafka/__init__.py b/environment/1_sdk/cec_kafka/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..17cd0f5c027a627564ec97293b82d35bbeb03134 --- /dev/null +++ b/environment/1_sdk/cec_kafka/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/7 21:16 +Author: zhangque (Wardenjohn) +Email ydzhang@linux.alibaba.com +File __init__.py.py +Description: +""" +name = "cec_kafka" diff --git a/environment/1_sdk/cec_kafka/admin_static.py b/environment/1_sdk/cec_kafka/admin_static.py new file mode 100644 index 0000000000000000000000000000000000000000..3b555d4c093552e058635d630bb111ca52a8b004 --- /dev/null +++ b/environment/1_sdk/cec_kafka/admin_static.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- # +""" +Time 2022/9/26 21:13 +Author: mingfeng (SunnyQjm), zhangque (Wardenjohn) +Email mfeng@linux.alibaba.com, ydzhang@linux.alibaba.com +File admin_static.py +Description: static call method +""" +import sys +from typing import Optional, List +from itertools import chain +from clogger import logger +from cec_base.exceptions import TopicNotExistsException, \ + TopicAlreadyExistsException, ConsumerGroupNotExistsException, \ + ConsumerGroupAlreadyExistsException +from cec_base.meta import TopicMeta, PartitionMeta, ConsumerGroupMeta, \ + ConsumerGroupMemberMeta +from cec_base.exceptions import CecException +from cec_base.url import CecUrl +from .utils import raise_if_not_ignore +from .consume_status_storage import ConsumeStatusStorage +from .common import StaticConst +from confluent_kafka.admin import AdminClient, NewTopic +from confluent_kafka import Consumer as ConfluentKafkaConsumer +from confluent_kafka import Consumer, TopicPartition +import confluent_kafka +from confluent_kafka.error import KafkaException, KafkaError + + +#################################################################### +# Static function implementation of the management interface +#################################################################### +def static_create_topic(kafka_admin_client: AdminClient, + topic_name: str = "", num_partitions: int = 1, + replication_factor: int = 1, + ignore_exception: bool = False, + expire_time: int = 24 * 60 * 60 * 1000) -> bool: + try: + res = kafka_admin_client.create_topics( + [NewTopic(topic_name, num_partitions, replication_factor)]) + res.get(topic_name).result() + except KafkaException as ke: + if ke.args[0].code() == KafkaError.TOPIC_ALREADY_EXISTS: + return raise_if_not_ignore(ignore_exception, + TopicAlreadyExistsException( + f"Topic {topic_name} already " + f"exists." + )) + else: + return raise_if_not_ignore(ignore_exception, ke) + except Exception as e: + return raise_if_not_ignore(ignore_exception, e) + return True + +def static_del_topic(kafka_admin_client: AdminClient, topic_name: str, + ignore_exception: bool = False): + """static method of deleting one topic + + this method of deleting one topic can be invoked by static method + + Args: + kafka_admin_client (AdminClient): _description_ + topic_name (str): _description_ + ignore_exception (bool, optional): _description_. Defaults to False. + + Returns: + _type_: _description_ + """ + try: + res = kafka_admin_client.delete_topics([topic_name]) + res.get(topic_name).result() + except KafkaException as ke: + if ke.args[0].code() == KafkaError.UNKNOWN_TOPIC_OR_PART: + return raise_if_not_ignore(ignore_exception, + TopicNotExistsException( + f"Someone else is creating or deleting " + f"this topic." + )) + else: + return raise_if_not_ignore(ignore_exception, ke) + except Exception as e: + return raise_if_not_ignore(ignore_exception, e) + return True + +def static_is_topic_exist(kafka_admin_client: AdminClient, + topic_name: str) -> bool: + class_meta = kafka_admin_client.list_topics(topic_name) + return class_meta.topics.get(topic_name).error is None + +def static_get_topic_list(kafka_admin_client: AdminClient) -> [TopicMeta]: + class_meta = kafka_admin_client.list_topics() + res = [] + for topic in class_meta.topics.values(): + new_topic = TopicMeta(topic.topic) + new_topic.error = topic.error + for p_key, p_value in topic.partitions.items(): + new_topic.partitions[p_key] = PartitionMeta(p_value.id) + res.append(new_topic) + return res + +def static_create_consumer_group(kafka_admin_client: AdminClient, + url: CecUrl, consumer_group_id: str, + ignore_exception: bool = False + ): + + if KafkaAdmin.static_is_consumer_group_exist(kafka_admin_client, + consumer_group_id): + return raise_if_not_ignore( + ignore_exception, ConsumerGroupAlreadyExistsException( + f"Consumer group {consumer_group_id} already exists.")) + + _kafka_consumer_client = Consumer({ + 'bootstrap.servers': url.netloc, + "request.timeout.ms": 600000, + 'group.id': consumer_group_id, + **url.params, + }) + try: + _kafka_consumer_client.subscribe(['__consumer_offsets']) + _kafka_consumer_client.poll(0.1) + except Exception as e: + return raise_if_not_ignore( + ignore_exception, e) + return True + +def static_is_consumer_group_exist( + kafka_admin_client: AdminClient, + consumer_group_id: str) -> bool: + return len(kafka_admin_client.list_groups(consumer_group_id)) > 0 + +def static_get_consumer_group_list( + kafka_admin_client: AdminClient + ) -> [ConsumerGroupMeta]: + groups = kafka_admin_client.list_groups() + res = [] + for group in groups: + new_group = ConsumerGroupMeta(group.id) + new_group.error = group.error + for member in group.members: + new_group.members.append( + ConsumerGroupMemberMeta(member.client_id)) + res.append(new_group) + return res + diff --git a/environment/1_sdk/cec_kafka/common.py b/environment/1_sdk/cec_kafka/common.py new file mode 100644 index 0000000000000000000000000000000000000000..aa19b96c9696008245708c2b1d68b9c159adfa2f --- /dev/null +++ b/environment/1_sdk/cec_kafka/common.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- # +""" +Time 2022/7/29 13:33 +Author: Zhangque (Wardenjohn) +Email ydzhang@linux.alibaba.com +File common.py +Description: Static variable +""" + +from cec_base.url import CecUrl + +class StaticConst: + """Static consts + + This class defines all the static constant values in the cec-redis module + """ + + # List of specialization parameters + KAFKA_SPECIAL_PARM_CEC_BATCH_CONSUMER_LIMIT = "batch_consume_limit" + KAFKA_SPECIAL_PARM_CEC_CONSUME_TIMEOUT = "kafka_consume_event_timeout" + # KAFKA_CONNECTTION_PARAMS_ **** + + """ + The special parameter supported by Kafka + """ + _kafka_special_parameter_list = [ + # batch_consume_limit => the number of event to consume in one batch + # 1. Effective range:[Consumer] + # 2. Meaning: This parameter specifies the number of event of one + # consumer consume at one time. For some case of Kafka, + # if the number of event not reach the batch_consume_limit, + # the consumer process may block. + KAFKA_SPECIAL_PARM_CEC_BATCH_CONSUMER_LIMIT, + # kafka_consume_event_timeout => The time to wait for an event + # 1. Effective range:[Consumer] + # 2. Meaning: This parameter specifies the time to wait for an event. + # If the consumer wait for an event too long to reach the + # limit of this timeout, consumer will stop to wait more + # event even though the number of message still not reach + # the batch_consume_limie. + KAFKA_SPECIAL_PARM_CEC_CONSUME_TIMEOUT + ] + + _kafka_special_parameters_default_value = { + KAFKA_SPECIAL_PARM_CEC_BATCH_CONSUMER_LIMIT: (int, 1), + KAFKA_SPECIAL_PARM_CEC_CONSUME_TIMEOUT: (int, 1) + } + + """ + The following _kafka_connection_parameter_list is used to generate the supported + parameter supported by the connection process to kafka server. + _kafka_connection_parameters_default_value is used to generate the default value + of kafka server connection. + """ + + _kafka_connection_parameter_list = [ + + ] + + _kafka_connection_parameters_default_value = { + + } + + @staticmethod + def parse_special_parameter(params: dict) -> dict: + """Parse specialization parameters + + Parse the specialization parameters and remove the specialization + parameters from the parameter list + + Args: + params(dict): CecUrl.params + + Returns: + + """ + res = {} + for key in StaticConst._kafka_special_parameter_list: + _type, default = \ + StaticConst._kafka_special_parameters_default_value[key] + res[key] = _type(params.pop(key, default)) + return res + + @staticmethod + def parse_kafka_connection_params(params: dict) -> dict: + """Parse kafka connection parameters + + Args: + params(dict): CecUrl.params + + Returns: + params(dict) + """ + res = {} + for key in StaticConst._kafka_connection_parameter_list: + res[key] = params[key] + return res + + +class ClientBase: + """ + cec-kafka client base class, which provides some generic implementation + """ + + def __init__(self, url: CecUrl): + self._redis_version = None + self._special_params = StaticConst.parse_special_parameter(url.params) + + def get_special_param(self, key: str, default=''): + """Get specialization parameter by key + + Args: + key(str): specialization parameter key + default(Any): default value if key not exists + + Returns: + + """ + return self._special_params.get(key, default) diff --git a/environment/1_sdk/cec_kafka/kafka_admin.py b/environment/1_sdk/cec_kafka/kafka_admin.py new file mode 100644 index 0000000000000000000000000000000000000000..06753bc5af58c8ddd3e84ff7bffc96fef17a0194 --- /dev/null +++ b/environment/1_sdk/cec_kafka/kafka_admin.py @@ -0,0 +1,502 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/11 20:20 +Author: mingfeng (SunnyQjm), zhangque (Wardenjohn) +Email mfeng@linux.alibaba.com, ydzhang@linux.alibaba.com +File kafka_admin.py +Description: Kafka Admin. For Kafka manage object. +""" +import json +import uuid + +from cec_base.admin import Admin, ConsumeStatusItem +from cec_base.exceptions import TopicNotExistsException, TopicAlreadyExistsException +from cec_base.exceptions import ConsumerGroupAlreadyExistsException, CecException +from cec_base.event import Event +from cec_base.url import CecUrl +from cec_base.meta import TopicMeta, PartitionMeta, ConsumerGroupMeta, \ + ConsumerGroupMemberMeta +from clogger import logger +from confluent_kafka.admin import AdminClient, NewTopic +from confluent_kafka import Consumer as ConfluentKafkaConsumer +from confluent_kafka import Consumer, TopicPartition +import confluent_kafka +from confluent_kafka.error import KafkaException, KafkaError +from .utils import raise_if_not_ignore +from .common import StaticConst, ClientBase +class KafkaAdmin(Admin): + """This is a kafka-based execution module implement of Admin + + KafkaAdmin contain the function of managing topic and group, etc. + + https://github.com/confluentinc/confluent-kafka-python/blob/master/examples/list_offsets.py + """ + + _EVENT_KEY_KAFKA_CONSUMER_MESSAGE = "_EVENT_KEY_KAFKA_CONSUMER_MESSAGE" + + """__init__ for object KafkaAdmin + Args: + url: CecUrl Object. CecUrl contains cec url format definition + """ + def __init__(self, url: CecUrl) -> None: + super().__init__() + self._kafka_admin_client: AdminClient = None + self._kafka_consumer_client: ConfluentKafkaConsumer = None + self._current_url: str = "" + self._current_cec_url: CecUrl= url + url.params = StaticConst.parse_kafka_connection_params(url.params) + self.connect_by_cec_url(url) + + @staticmethod + def static_create_topic(kafka_admin_client: AdminClient, + topic_name: str = "", num_partitions: int = 1, + replication_factor: int = 1, + ignore_exception: bool = False, + expire_time: int = 24 * 60 * 60 * 1000) -> bool: + try: + res = kafka_admin_client.create_topics( + [NewTopic(topic_name, num_partitions, replication_factor)]) + res.get(topic_name).result() + except KafkaException as ke: + if ke.args[0].code() == KafkaError.TOPIC_ALREADY_EXISTS: + return raise_if_not_ignore(ignore_exception, + TopicAlreadyExistsException( + f"Topic {topic_name} already " + f"exists." + )) + else: + return raise_if_not_ignore(ignore_exception, ke) + except Exception as e: + return raise_if_not_ignore(ignore_exception, e) + return True + + def create_topic(self, topic_name: str = "", num_partitions: int = 1, + replication_factor: int = 1, + ignore_exception: bool = False, + expire_time: int = 24 * 60 * 60 * 1000) -> bool: + """Create one topic to Kafka Server + + Args: + topic_name: the unique identifier of the topic + num_partitions: the number of the partitions + 1. This parameter set how many partitions should be devided of the data from one topic. + This data will be store in different cluster blocker. + 2. If the underlying message middleware supports partitioning (such as Kafka), + partitioning can be done based on this configuration; + 3. If the underlying message middleware do not support partitioning (such as Redis), + ignore this parameter will be enough (one partition is enough). You can use Admin.is_support_partitions() + to judge if the underlying message midddleware is support this feature or not. + + replication_factor: set how many copy of this topic data + + 1. 该参数制定了在分布式集群部署的场景下,同一个主题的分区存在副本的数量,如果 replication_factor == 1 + 则表示主题下的所有分区都只有一个副本,一旦丢失不可回复; + 2. 如果底层的消息中间件支持数据副本,则可以依据该配置进行对应的设置; + 3. 如果底层的消息中间件不支持数据副本,则忽略该参数即可(即认定只有一个副本即可),可以通过 + Admin.is_support_replication() 方法判定当前使用的小心中间件实现是否支持该特性; + + ignore_exception: 是否忽略可能会抛出的异常 + expire_time: 事件超时时间(单位:ms,默认:1day) + + 1. 该参数指定了目标 Topic 中每个事件的有效期; + 2. 一旦一个事件的加入到 Topic 的时间超过了 expire_time,则cec不保证该事件 + 的持久性,cec应当在合适的时候删除超时的事件; + 3. 不强制要求超时的事件被立即删除,可以对超时的事件进行周期性的清理。 + + Returns: + bool: True if successful, False otherwise. + + Raises: + TopicAlreadyExistsException: If topic already exists + + Examples: + >>> admin = dispatch_admin("kafka://localhost:6379") + >>> admin.create_topic("test_topic") + True + """ + return KafkaAdmin.static_create_topic( + self._kafka_admin_client, + topic_name, num_partitions, replication_factor, ignore_exception, + expire_time + ) + + @staticmethod + def static_del_topic(kafka_admin_client: AdminClient, topic_name: str, + ignore_exception: bool = False): + """static method of deleting one topic + + this method of deleting one topic can be invoked by static method + + Args: + kafka_admin_client (AdminClient): _description_ + topic_name (str): _description_ + ignore_exception (bool, optional): _description_. Defaults to False. + + Returns: + _type_: _description_ + """ + try: + res = kafka_admin_client.delete_topics([topic_name]) + res.get(topic_name).result() + except KafkaException as ke: + if ke.args[0].code() == KafkaError.UNKNOWN_TOPIC_OR_PART: + return raise_if_not_ignore(ignore_exception, + TopicNotExistsException( + f"Someone else is creating or deleting " + f"this topic." + )) + else: + return raise_if_not_ignore(ignore_exception, ke) + except Exception as e: + return raise_if_not_ignore(ignore_exception, e) + return True + + def del_topic(self, topic_name: str, + ignore_exception: bool = False) -> bool: + """Delete one topic + + 删除一个 Topic => 对应到 Kafka 应该是删除一个 Topic + + Args: + topic_name: 主题名字(主题的唯一标识) + ignore_exception: 是否忽略可能会抛出的异常 + + Returns: + bool: True if successful, False otherwise. + + Raises: + TopicNotExistsException: If topic not exists + + Examples: + >>> admin = dispatch_admin("kafka://localhost:6379") + >>> admin.del_topic("test_topic") + True + """ + return KafkaAdmin.static_del_topic(self._kafka_admin_client, + topic_name, ignore_exception) + + @staticmethod + def static_is_topic_exist(kafka_admin_client: AdminClient, + topic_name: str) -> bool: + class_meta = kafka_admin_client.list_topics(topic_name) + return class_meta.topics.get(topic_name).error is None + + def is_topic_exist(self, topic_name: str) -> bool: + """Judge whether one specific topic is exists + + 判断 Topic 是否存在 => 对应到 Kafka 应该是判断是否存最对应topic + 1. 通过 list_topics 接口并指定 topic,判断返回的 Topic 是否有效进行判断 + + Args: + topic_name: 主题名字(主题的唯一标识) + + Returns: + bool: True if topic exists, False otherwise. + + Examples: + >>> admin = dispatch_admin("kafka://localhost:9092") + >>> admin.is_topic_exist("test_topic") + True + """ + return KafkaAdmin.static_is_topic_exist(self._kafka_admin_client, + topic_name) + + @staticmethod + def static_get_topic_list(kafka_admin_client: AdminClient) -> [TopicMeta]: + class_meta = kafka_admin_client.list_topics() + res = [] + for topic in class_meta.topics.values(): + new_topic = TopicMeta(topic.topic) + new_topic.error = topic.error + for p_key, p_value in topic.partitions.items(): + new_topic.partitions[p_key] = PartitionMeta(p_value.id) + res.append(new_topic) + return res + + def get_topic_list(self) -> [TopicMeta]: + """Get topic list + + 获取 Topic 列表 => 对应到 Redis 应该是获取所有 Topic 的列表 + + Args: + + Returns: + [str]: The topic name list + + Examples: + >>> admin = dispatch_admin("kafka://localhost:6379") + >>> admin.get_topic_list() + [TopicMeta(faeec676-60db-4418-a775-c5f1121d5331, 1)] + """ + return KafkaAdmin.static_get_topic_list(self._kafka_admin_client) + + @staticmethod + def static_create_consumer_group(kafka_admin_client: AdminClient, + url: CecUrl, consumer_group_id: str, + ignore_exception: bool = False + ): + + if KafkaAdmin.static_is_consumer_group_exist(kafka_admin_client, + consumer_group_id): + return raise_if_not_ignore( + ignore_exception, ConsumerGroupAlreadyExistsException( + f"Consumer group {consumer_group_id} already exists.")) + + _kafka_consumer_client = Consumer({ + 'bootstrap.servers': url.netloc, + "request.timeout.ms": 600000, + 'group.id': consumer_group_id, + **url.params, + }) + try: + _kafka_consumer_client.subscribe(['__consumer_offsets']) + _kafka_consumer_client.poll(0.1) + except Exception as e: + return raise_if_not_ignore( + ignore_exception, e) + return True + + def create_consumer_group(self, consumer_group_id: str, + ignore_exception: bool = False) -> bool: + """Create one consumer group + + 创建一个消费组 + 1. __consumer_offsets 是 kafka 中用来传递消费者偏移量的特殊 Topic; + 2. 可以通过构建一个 Consumer,定语 __consumer_offsets 这个特殊主题,并且 + 指定消费组 ID,则指定的消费组就会被正常创建。 + + Args: + consumer_group_id: 消费组ID,应当具有唯一性 + ignore_exception: 是否忽略可能会抛出的异常 + + Returns: + bool: True if successful, False otherwise. + + Raises: + ConsumerGroupAlreadyExistsException: If consumer group already + exists + + Examples: + >>> admin = dispatch_admin("kafka://localhost:6379") + >>> admin.create_consumer_group("test_group") + True + """ + res = KafkaAdmin.static_create_consumer_group( + self._kafka_admin_client, self._current_cec_url, consumer_group_id, + ignore_exception) + return res + + def del_consumer_group(self, consumer_group_id: str, + ignore_exception: bool = False) -> bool: + raise CecException( + "Not implement del_consumer_group for current proto") + + @staticmethod + def static_is_consumer_group_exist( + kafka_admin_client: AdminClient, + consumer_group_id: str) -> bool: + return len(kafka_admin_client.list_groups(consumer_group_id)) > 0 + + def is_consumer_group_exist(self, consumer_group_id: str) -> bool: + return KafkaAdmin.static_is_consumer_group_exist( + self._kafka_admin_client, + consumer_group_id + ) + + @staticmethod + def static_get_consumer_group_list( + kafka_admin_client: AdminClient + ) -> [ConsumerGroupMeta]: + groups = kafka_admin_client.list_groups() + res = [] + for group in groups: + new_group = ConsumerGroupMeta(group.id) + new_group.error = group.error + for member in group.members: + new_group.members.append( + ConsumerGroupMemberMeta(member.client_id)) + res.append(new_group) + return res + + def get_consumer_group_list(self) -> [ConsumerGroupMeta]: + return KafkaAdmin.static_get_consumer_group_list( + self._kafka_admin_client + ) + + def get_consume_status(self, topic: str, consumer_group_id: str = "", + partition: int = 0) -> [ConsumeStatusItem]: + """Get consumption info for specific + + 获取特定消费者组对某个主题下的特定分区的消费情况,应包含以下数据 + 1. 最小ID(最小 offset) + 2. 最大ID(最大 offset) + 3. 分区中存储的事件总数(包括已消费的和未消费的) + 4. 最后一个当前消费组在该分区已确认的事件ID(最后一次消费者确认的事件的ID) + 5. 分区的消息堆积数量 LAG(已经提交到该分区,但是没有被当前消费者消费或确认的事件数量) + + Args: + topic: 主题名字 + consumer_group_id: 消费组ID + 1. 如果 consumer_group_id 为空字符串或者None,则返回订阅了该主题的所有 + 消费组的消费情况;=> 此时 partition 参数无效(将获取所有分区的消费数据) + 2. 如果 consumer_group_id 为无效的组ID,则抛出异常; + 3. 如果 consumer_group_id 为有效的组ID,则只获取该消费组的消费情况。 + partition: 分区ID + 1. 如果 partition 指定有效非负整数 => 返回指定分区的消费情况 + 2. 如果 partition 指定无效非负整数 => 抛出异常 + 3. 如果 partition 指定负数 => 返回当前主题下所有分区的消费情况 + + Raises: + CecException + + References: + https://github.com/confluentinc/confluent-kafka-python/blob/master/examples/list_offsets.py + + Returns: + + """ + + def _inner_get_consume_status( + c: ConfluentKafkaConsumer, group_id: str, tp: TopicPartition + ): + (lo, hi) = c.get_watermark_offsets(tp, + timeout=10, + cached=False) + if tp.offset == confluent_kafka.OFFSET_INVALID: + offset = "-" + else: + offset = f"{tp.partition}-{tp.offset - 1}" + if hi < 0: + lag = "no hwmark" # Unlikely + elif tp.offset < 0: + lag = hi - lo + else: + lag = hi - tp.offset + + return ConsumeStatusItem( + topic, group_id, tp.partition, min_id=f"{tp.partition}-{lo}", + max_id=f"{tp.partition}-{hi - 1}", total_event_count=hi - lo, last_ack_id=offset, lag=lag + ) + + if consumer_group_id != "" and consumer_group_id is not None: + # 获取指定消费组 + consumer = ConfluentKafkaConsumer({ + 'bootstrap.servers': self._current_cec_url.netloc, + 'group.id': consumer_group_id + }) + if partition >= 0: + # 获取指定分区 + committed = consumer.committed( + [TopicPartition(topic, partition)], timeout=10) + return [_inner_get_consume_status( + consumer, consumer_group_id, committed[0])] + else: + # 获取所有分区 + committed = consumer.committed([partition], timeout=10) + return [ + _inner_get_consume_status(consumer, consumer_group_id, tp) + for tp in committed] + else: + # 获取所有消费组的消费情况 + groups = self.get_consumer_group_list() + res = [] + for group in groups: + consumer = ConfluentKafkaConsumer({ + 'bootstrap.servers': self._current_cec_url.netloc, + 'group.id': group.group_id + }) + metadata = consumer.list_topics(topic, timeout=10) + partitions = [confluent_kafka.TopicPartition(topic, p) for p in + metadata.topics[topic].partitions] + committed = consumer.committed(partitions, timeout=10) + for tp in committed: + if tp.offset == confluent_kafka.OFFSET_INVALID: + continue + else: + res.append( + _inner_get_consume_status(consumer, group.group_id, + tp)) + return res + + def get_event_list(self, topic: str, partition: int, offset: str, + count: int) -> [Event]: + """ Get event list for specific + + 获取特定主题在指定分区下的消息列表 + 1. offset 和 count 用于分页 + + Args: + topic: 主题名字 + partition: 分区ID + offset: 偏移(希望读取在该 ID 之后的消息) + count: 最大读取数量 + + Returns: + + """ + consumer = ConfluentKafkaConsumer({ + 'bootstrap.servers': self._current_cec_url.netloc, + 'group.id': uuid.uuid4() + }) + tp = TopicPartition(topic, partition) + if offset == '-': + tp.offset = 0 + else: + tp.offset = int(offset.split('-')[-1]) + 1 + + consumer.assign([tp]) + kafka_messages = consumer.consume(count, timeout=10) + messages = [] + for message in kafka_messages: + event = Event(json.loads(message.value().decode('utf-8')), + f"{message.partition()}-{message.offset()}") + event.put(KafkaAdmin._EVENT_KEY_KAFKA_CONSUMER_MESSAGE, message) + messages.append(event) + consumer.unassign() + return messages + + def is_support_partitions(self) -> bool: + return True + + def is_support_replication(self) -> bool: + return True + + def connect_by_cec_url(self, url: CecUrl): + """Connect to Kafka server by CecUrl + + Args: + url(str): CecUrl + """ + configs = { + 'bootstrap.servers': url.netloc, + "request.timeout.ms": 600000, + **url.params + } + """ + AdminClient is for Kafka Admin Object. + This Object is init by a dictionary. + confs = { + 'bootstrap.servers': url.netloc, # the location of the server + "request.timeout.ms": 600000, + 'sasl.mechanisms': "", #optional, usually PLAIN + 'ssl.ca.location': "", #optional, the patch to certs + 'security.protocol': "", #optional, the security protocal, eg. SASL_SSL + 'sasl.username': "", # optional, provide it if Kafka server need it + 'sasl.password': "" # optional, provide it if Kafka server need it + } + """ + self._kafka_admin_client = AdminClient(conf=configs) + self._current_url = url.__str__() + + def connect(self, url: str): + """Connect to Kafka server by url + + Args: + url(str): CecUrl + """ + cec_url = CecUrl.parse(url) + return self.connect_by_cec_url(cec_url) + + def disconnect(self): + self._kafka_admin_client = None + diff --git a/environment/1_sdk/cec_kafka/kafka_consumer.py b/environment/1_sdk/cec_kafka/kafka_consumer.py new file mode 100644 index 0000000000000000000000000000000000000000..7a258f89a48e1a14c0b8f17548eb64598c298374 --- /dev/null +++ b/environment/1_sdk/cec_kafka/kafka_consumer.py @@ -0,0 +1,225 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/11 20:20 +Author: mingfeng (SunnyQjm), zhangque (Wardenjohn) +Email mfeng@linux.alibaba.com, ydzhang@linux.alibaba.com +File kafka_admin.py +Description: Kafka Consumer. +""" +import json +import uuid + +from cec_base.consumer import Consumer +from cec_base.event import Event +from cec_base.url import CecUrl +from confluent_kafka import Consumer as ConfluentKafkaConsumer +from confluent_kafka import TopicPartition +from clogger import logger +from queue import Queue +from .common import StaticConst, ClientBase + +class KafkaConsumer(Consumer, ClientBase): + """A Kafka Based Consumer + + Description: + KafkaConsumer._EVENT_KEY_KAFKA_CONSUMER_MESSAGE is an key. + This key is use for internal message storage. An Event from consumer get from Kafka server + will be put into the dicrionary in Event object. Use Event.get(key) can get the event object + for futher operations. + """ + + _EVENT_KEY_KAFKA_CONSUMER_MESSAGE = "_EVENT_KEY_KAFKA_CONSUMER_MESSAGE" + + def __init__(self, url: CecUrl, topic_name: str, consumer_id: str = "", + group_id: str = "", start_from_now: bool = True, + default_batch_consume_limit: int = 5): + super().__init__(topic_name, consumer_id, group_id, start_from_now, + default_batch_consume_limit=default_batch_consume_limit) + ClientBase.__init__(self, url) + self._current_url = "" + # save the original special params + orig_special_params = url.params + + self._batch_consume_limit = self.get_special_param( + StaticConst.KAFKA_SPECIAL_PARM_CEC_BATCH_CONSUMER_LIMIT + ) + self._timeout = self.get_special_param( + StaticConst.KAFKA_SPECIAL_PARM_CEC_CONSUME_TIMEOUT + ) + + # before connect to cec by url, filter the parameter kafka not support + url.params = StaticConst.parse_kafka_connection_params(url.params) + + # set auto-offset-reset = earliest if you want to consume message + # from the start + if group_id != "" or not start_from_now: + url.params['auto.offset.reset'] = 'earliest' + else: + url.params['auto.offset.reset'] = 'latest' + + url.params['client.id'] = consumer_id + + self._current_url = url + self._kafka_consumer_client: ConfluentKafkaConsumer = None + # if group_id is set, use the given group id or generate one random group id + url.params['group.id'] = uuid.uuid4() if group_id == "" else group_id + + self.connect_by_cec_url(url) + self._kafka_consumer_client.subscribe([topic_name], on_assign=self.on_assign) + self._last_event_id: str = None # save the id of lastest consume message + self._message_cache_queue = Queue() # Queue of message cache + + def on_assign(self, consumer: ConfluentKafkaConsumer, + partitions: [TopicPartition]): + """on_assign + + Args: + consumer (ConfluentKafkaConsumer): _description_ + partitions (TopicPartition]): _description_ + """ + print(partitions) + pass + + def consume(self, timeout: int = -1, auto_ack: bool = False, + batch_consume_limit: int = -1) -> [Event]: + """consume + + Args: + timeout (int, optional): timeout setting (ms). Defaults to -1. If timeout <=0 ,it means block to wait + auto_ack (bool, optional): set to auto ack an message, which is effect to group consume. Defaults to False. + 1. If auto_ack switch to open, every event read by consumer will be ack automaticlly. + 2. The caller should make sure this event will be correctly handled + batch_consume_limit (int, optional): The limitation of batch consume. Defaults to 0. For some cases of Kafka, + if the message do not achieve the consume limit batch, the consume process will block and wait. + timeout (int, optional): The timeout limit for consumer to wait. timeout is set default 1. Becase in some cases of + Kafka, if the number of message do not reach the batch_consume_limit, the consume process will block. This + parameter should be set in association with batch_consume_limit. + + + Returns: + [Event]: _description_ + Description: + + """ + batch_consume_limit = self._batch_consume_limit if batch_consume_limit <= 0 else batch_consume_limit + timeout = self._timeout if timeout <=0 else timeout + kafka_message = self._kafka_consumer_client.consume( + batch_consume_limit, + timeout/1000 + ) + # message is a list, which contain Event objects + message_list: [Event] = [] + for message in kafka_message: + try: + event = Event(json.loads(message.value().decode('utf-8')), + f"{message.partition()}-{message.offset()}") + except Exception as e: + # if exception is raised, it means the message is not in the json format(event not in dict format) + event = Event(message.value().decode('utf-8'), + f"{message.partition()}-{message.offset()}") + # event => Event: Object + # event.put => event._cache: {key:_EVENT_KEY, value:message} + event.put(KafkaConsumer._EVENT_KEY_KAFKA_CONSUMER_MESSAGE, message) + message_list.append(event) + if auto_ack: + self._kafka_consumer_client.commit(message) + return message_list + + def ack(self, event: Event) -> int: + """Ack one event + + Args: + event (Event): _description_ + + Returns: + int: _description_ + + Description: + """ + message = event.get(KafkaConsumer._EVENT_KEY_KAFKA_CONSUMER_MESSAGE) + ret = False + if message is not None: + # this is an asynchronous commit + self._kafka_consumer_client.commit(message) + ret = True + return ret + + def __getitem__(self, item): + """get the next message from queue + + Args: + item (_type_): _description_ + + Description: + overwite __getitem__ + get next message from _message_cache_queue. + if _message_cache_queue is empty, try to consume event from server + then put it into the _message_cache_queue for iterator + """ + msg=None + if not self._message_cache_queue.empty(): + msg = self._message_cache_queue.get() + else: + for new_msg in self.consume(): + self._message_cache_queue.put(new_msg) + if not self._message_cache_queue.empty(): + msg = self._message_cache_queue.get() + return msg + + def connect_by_cec_url(self, url: CecUrl): + """Connect to Kafka server by CecUrl + + Args: + url(str): CecUrl + """ + self._kafka_consumer_client = ConfluentKafkaConsumer({ + 'bootstrap.servers': url.netloc, + "request.timeout.ms": 600000, + **url.params + }) + self._current_url = url.__str__() + return self + + def connect(self, url:str): + """connetct to server + First use CecUrl.parse to generate an CecUrl Object from the + given url. + Args: + url (str): This url is a string. + Contains protocal, netlocation, params, etc. + + Returns: + _type_: KafkaConsumer + + + """ + cec_url = CecUrl.parse(url) + return self.connect_by_cec_url(cec_url) + + def disconnect(self): + """Disconnect from Kafka server + """ + if self._kafka_consumer_client is None: + return + + self._kafka_consumer_client.close() + self._kafka_consumer_client = None + + def __next__(self): + msg = None + try: + if not self._message_cache_queue.empty(): + msg = self._message_cache_queue.get() + else: + # consume msg from remote server + for new_msg in self.consume(): + self._message_cache_queue.put(new_msg) + if not self._message_cache_queue.empty(): + msg = self._message_cache_queue.get() + except Exception as e: + pass + finally: + if msg is None: + raise StopIteration() + return msg + \ No newline at end of file diff --git a/environment/1_sdk/cec_kafka/kafka_producer.py b/environment/1_sdk/cec_kafka/kafka_producer.py new file mode 100644 index 0000000000000000000000000000000000000000..c37a8ce6e79649099f8f9dc3cb60bac0234d94e2 --- /dev/null +++ b/environment/1_sdk/cec_kafka/kafka_producer.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/12 11:07 +Author: mingfeng (SunnyQjm), zhangque (Wardenjohn) +Email mfeng@linux.alibaba.com, ydzhang@linux.alibaba.com +File kafka_producer.py +Description: The producer of Kafka +""" +import json +from typing import Callable, Union + +from cec_base.producer import Producer +from cec_base.event import Event +from cec_base.exceptions import TopicNotExistsException, CecException +from cec_base.url import CecUrl +from clogger import logger +from confluent_kafka import Producer as ConfluentKafkaProducer +from confluent_kafka.cimpl import Message +from .common import StaticConst, ClientBase +class KafkaProducer(Producer): + """ A Kafka Based Producer + """ + + _EVENT_KEY_KAFKA_PRODUCER_MESSAGE = "_EVENT_KEY_KAFKA_PRODUCER_MESSAGE" + + def __init__(self, url: CecUrl, auto_mk_topic: bool = False, **kwargs) -> None: + self._current_url = "" + if 'default_max_len' in kwargs: + self.default_max_len = kwargs['default_max_len'] + else: + self.default_max_len = 1000 + #self.auto_mk_topic = auto_mk_topic + #url.params['allow.auto.create.topics'] = auto_mk_topic + # 1. define ConfluentKafkaProducer type object of _kafka_producer_client + # 2. use connect_by_cec_url to connect to kafka server + self._kafka_producer_client: ConfluentKafkaProducer = None + url.params = StaticConst.parse_kafka_connection_params(url.params) + self.connect_by_cec_url(url) + self._current_cec_url = url + + # create an dict, to save the reflaction from topic_name => TopicMeta + self._topic_metas = { + + } + + def produce(self, topic_name: str, message_value: dict, callback: Callable[[Exception, Event], None] = None, + partition: int = -1, + **kwargs): + """Produce method + Generate one event and push it into the event center + + Args: + topic_name (str): _description_ + message_value (dict): _description_ + callback (Callable[[Exception, Event], None], optional): _description_. Defaults to None. + partition (int): + 1. if partition is passed, event will be sent to this partition (not recommand) + 2. if a positive partition number is passed, but this partition is not exsit, an exception is raised. + 3. if a negative partition number is passed, The message will be evenly delivered to all partitions + using the built-in strategy (recommended) + Examples: + >>> producer = dispatch_producer( + ..."kafka://localhost:6379?password=123456") + >>> producer.produce("test_topic", {"value": "hhh"}) + """ + def deal_callback(err, msg: Message, value: dict, cb: Callable[[Exception, Event], None] = None): + """deal callback + + Args: + err (_type_): _description_ + msg (Message): _description_ + value (dict): _description_ + cb (Callable[[Exception, Event, None]], optional): _description_. Defaults to None. + + Description: + + """ + if msg.error() is not None: + err = msg.error() + if err is not None: + cb(CecException(err), None) + event = Event(value, f"{msg.partition()}-{msg.offset()}") + event.put(KafkaProducer._EVENT_KEY_KAFKA_PRODUCER_MESSAGE, msg) + if cb is not None: + cb(None, event) + + topic_exist = False + + if topic_name not in self._topic_metas or self._topic_metas[topic_name].error is not None: + self._topic_metas[topic_name] = self._kafka_producer_client.list_topics(topic_name).topics[topic_name] + if self._topic_metas[topic_name].error is None: + topic_exist = True + else: + topic_exist = True + + if not topic_exist: + callback(TopicNotExistsException( + f"{self} Topic ({topic_name}) not exists."), None + ) + return + + params = { + **kwargs, + 'callback': lambda err, msg: deal_callback( + err, msg, message_value, callback + ) + } + # 指定分区处理 + if partition >= 0: + params['partition'] = partition + + self._kafka_producer_client.produce( + topic_name, json.dumps(message_value), + **params + ) + + def flush(self, timeout = -1): + """flush all cache event to kafka server immediatly + + Args: + timout (int, optional): _description_. Defaults to -1. + + Description: + this function is used to flush all cache event to server. + """ + if timeout <= 0: + timeout = -1 + else: + timeout = timeout / 1000 + self._kafka_producer_client.flush(timeout) + + def connect_by_cec_url(self, cec_url: CecUrl): + """connect to kafka server with the given cec url + + Args: + cec_url (CecUrl): An object generated by the configured cec url + """ + self._kafka_producer_client = ConfluentKafkaProducer({ + 'bootstrap.servers': cec_url.netloc, + 'request.timeout.ms': 600000, + **cec_url.params + }) + # CecUrl.__str__() is overwritten, return the url string + self._current_url = cec_url.__str__() + return self + + def connect(self, url: str): + cec_url = CecUrl.parse(url) + return self.connect_by_cec_url(cec_url=cec_url) + + def __del__(self): + self.disconnect() + + def disconnect(self): + if self._kafka_producer_client is None: + return + self._kafka_producer_client = None + print("disconnect with kafka server") \ No newline at end of file diff --git a/environment/1_sdk/cec_kafka/utils.py b/environment/1_sdk/cec_kafka/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..80fab63816bbcebf2b8201f64f77b4f6eb324be0 --- /dev/null +++ b/environment/1_sdk/cec_kafka/utils.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/12 17:13 +Author: mingfeng (SunnyQjm), zhangque (Wardenjohn) +Email mfeng@linux.alibaba.com, ydzhang@linux.alibaba.com +File utils.py +Description: Utils of cec_kafka +""" +from cec_base.url import CecUrl +from clogger import logger + +def raise_if_not_ignore(is_ignore_exception: bool, exception: Exception): + """Raise or ignore for specific exception + + Args: + is_ignore_exception: Is ignore exception while `exception` be raised. + exception: The exception want to check + """ + if is_ignore_exception: + # If you choose to ignore the exception, the ignored exception is + # logged in the log as an exception + logger.exception(exception) + return False + raise exception \ No newline at end of file diff --git a/environment/1_sdk/cec_redis/redis_producer.py b/environment/1_sdk/cec_redis/redis_producer.py index adc2dbe45509da75fce77f61484e187cf7945d24..137840e24bb57df7435a59e7421754e252f80a9f 100644 --- a/environment/1_sdk/cec_redis/redis_producer.py +++ b/environment/1_sdk/cec_redis/redis_producer.py @@ -151,8 +151,7 @@ class RedisProducer(Producer, ClientBase): f"Topic ({topic_name}) not exists.") else: logger.info( - f"{self} produce one message '{event_id}'=>" - f"{message_value} successfully." + f"{self} produce one message '{event_id}'" ) if callback is not None: diff --git a/environment/1_sdk/cmg_redis/redis_service_discovery.py b/environment/1_sdk/cmg_redis/redis_service_discovery.py index dd903d70493b029270cf4272b0fd16fe6de08101..a195d112636b57ae601e87df69d678b7c0da1110 100644 --- a/environment/1_sdk/cmg_redis/redis_service_discovery.py +++ b/environment/1_sdk/cmg_redis/redis_service_discovery.py @@ -82,4 +82,16 @@ class RedisServiceDiscovery(ServiceDiscovery, ClientBase): ) # Update strategy if possible self.get_instances(service_name, force) - return self._strategy_map[service_name].select() + selected = self._strategy_map[service_name].select() + try_count = 0 + while selected is None and try_count < 5: + self.get_instances(service_name, True) + selected = self._strategy_map[service_name].select() + try_count += 1 + time.sleep(0.1) + if selected is None: + raise Exception( + f"RedisServiceDiscovery: No available instance for " + f"service_name => {service_name}" + ) + return selected diff --git a/environment/1_sdk/gcache_base/gcache.py b/environment/1_sdk/gcache_base/gcache.py index 48123d40495b09172932bd965dcf890ddfa20731..67c56a4861b1e4643779b464303f62398905b2eb 100644 --- a/environment/1_sdk/gcache_base/gcache.py +++ b/environment/1_sdk/gcache_base/gcache.py @@ -8,7 +8,7 @@ Description: """ import importlib from abc import ABCMeta, abstractmethod -from typing import Union, Optional, Dict +from typing import Union, Optional, Dict, List from threading import Lock from clogger import logger from .exceptions import GCacheProtoAlreadyExistsException, \ @@ -39,6 +39,20 @@ class GCache(metaclass=ABCMeta): """ pass + + @abstractmethod + def push_list(self, key: str, value: Union[int, float, dict, str], + front: int = 0) -> int: + pass + + @abstractmethod + def pop_list(self, key: str, front: int = 0) -> Union[int, float, dict, str]: + pass + + @abstractmethod + def get_list(self, key: str, start: int = 0, + end: int = -1) -> List[Union[None, int, float, dict, str]]: + pass @abstractmethod def load(self, key: str) -> Union[None, int, float, dict, str]: @@ -47,7 +61,11 @@ class GCache(metaclass=ABCMeta): @abstractmethod def load_all(self) -> Dict[str, Union[int, float, dict, str]]: pass - + + @abstractmethod + def delete_list(self, key: str) -> bool: + pass + @abstractmethod def delete(self, key: str) -> bool: """ diff --git a/environment/1_sdk/gcache_redis/redis_gcache.py b/environment/1_sdk/gcache_redis/redis_gcache.py index 431e4e5247917414bed3f2d9ae9652141d4b9e3e..6e3aea407c1ffc4a352db416c0a15849f8091bbe 100644 --- a/environment/1_sdk/gcache_redis/redis_gcache.py +++ b/environment/1_sdk/gcache_redis/redis_gcache.py @@ -7,9 +7,10 @@ File redis_gcache.py Description: """ import json -from typing import Union, Dict +from typing import Union, Dict, List from gcache_base import GCache, GCacheUrl, GCacheException from redis_lua import XRedisHashTable +from clogger import logger from .common import ClientBase, StaticConst SEPARATOR = "_GCache_" @@ -60,6 +61,49 @@ class RedisGCache(GCache, ClientBase): expire=expire ) + def push_list(self, key: str, value: Union[int, float, dict, str], + front: int = 0) -> int: + try: + if front not in [0, 1]: + raise GCacheException(f"Got not supported front = {front}, expect one of [0, 1]") + + method = self.redis_client.lpush if front else self.redis_client.rpush + res = method(f"{self._table_name}:{key}", + f"{self._get_store_value(value)}") + return res + except Exception as e: + logger.exception(e) + return -1 + + def pop_list(self, key: str, front: int = 0) -> Union[None, int, float, dict, str]: + try: + actual_key = f"{self._table_name}:{key}" + if front: + res = self.redis_client.lpop(actual_key) + elif front == 0: + res = self.redis_client.rpop(actual_key) + else: + raise GCacheException( + f"Got not supported front = {front}, expect one of [0, 1]" + ) + if res is None: + return None + return self._get_format_value(res) + except Exception as e: + logger.exception(e) + return None + + def get_list(self, key: str, start: int = 0, + end: int = -1) -> List[Union[None, int, float, dict, str]]: + try: + res = self.redis_client.lrange(f"{self._table_name}:{key}",start, end) + if res is None: + return [] + return [self._get_format_value(r) for r in res] + except Exception as e: + logger.exception(e) + return [] + def _get_format_value(self, value: str) -> Union[None, int, float, dict, str]: type_value = value.split(SEPARATOR) if len(type_value) < 2: @@ -92,6 +136,15 @@ class RedisGCache(GCache, ClientBase): def clean(self): self._x_redis_hash_table.hdrop_table(self._table_name) + self._x_redis_hash_table.hdrop_list(self._table_name) + + def delete_list(self, key: str) -> bool: + try: + self.redis_client.delete(key) + return True + except Exception as e: + logger.exception(e) + return False def delete(self, key: str) -> bool: return self._x_redis_hash_table.hdel(self._table_name, key) diff --git a/environment/1_sdk/metric_reader/metric_reader.py b/environment/1_sdk/metric_reader/metric_reader.py index c0528e4477afccd21753f59a347a3fb5d43cbb3c..31cfd42a25e9e33d6496f1b49cb1a2baa6d7d9f2 100644 --- a/environment/1_sdk/metric_reader/metric_reader.py +++ b/environment/1_sdk/metric_reader/metric_reader.py @@ -13,7 +13,7 @@ from .result import MetricResult from .url import MetricReaderUrl from .exceptions import MetricReaderProtoAlreadyExistsException, \ MetricReaderProtoNotExistsException, MetricReaderException -from .task import RangeQueryTask +from .task import RangeQueryTask, InstantQueryTask from .common import StaticConst @@ -60,6 +60,29 @@ class MetricReader(metaclass=ABCMeta): Returns: """ + + @abstractmethod + def get_label_values(self, label_name) -> MetricResult: + """Get values list of specific label_name + + Args: + label_name: + + Returns: + + """ + + @abstractmethod + def instant_query(self, queries: List[InstantQueryTask]) -> MetricResult: + """Query data using query API for a specified metric with promql aggregation function + + Args: + queries([InstantQueryTask]): Query tasks + + Returns: + RangeVectorResult + """ + pass @abstractmethod def range_query(self, queries: List[RangeQueryTask]) -> MetricResult: diff --git a/environment/1_sdk/metric_reader/opentsdb_metric_reader.py b/environment/1_sdk/metric_reader/opentsdb_metric_reader.py index dca92f0bc52639d5ed94b23d5f89680361792d9b..aea284b316b318861eed5e5fbadf3320d1b95c35 100644 --- a/environment/1_sdk/metric_reader/opentsdb_metric_reader.py +++ b/environment/1_sdk/metric_reader/opentsdb_metric_reader.py @@ -70,6 +70,9 @@ class OpentsdbMetricReader(MetricReader): else: mr.data = res.json() return mr + + def get_label_values(self, label_name: str) -> List[str]: + pass def range_query(self, queries: List[RangeQueryTask]) -> MetricResult: params = { diff --git a/environment/1_sdk/metric_reader/prometheus_metric_reader.py b/environment/1_sdk/metric_reader/prometheus_metric_reader.py index 31f8e2dc3afdfe110be7a405403c843bfa79ba17..5bb9d3044292ee021bb456ba941887d2b93730d7 100644 --- a/environment/1_sdk/metric_reader/prometheus_metric_reader.py +++ b/environment/1_sdk/metric_reader/prometheus_metric_reader.py @@ -8,18 +8,24 @@ Description: """ import requests from typing import List +from enum import Enum from urllib.parse import urljoin -from .result import MetricResult, RangeVectorResult +from .result import * from .metric_reader import MetricReader from .url import MetricReaderUrl -from .task import RangeQueryTask +from .task import RangeQueryTask, QueryTask, InstantQueryTask from .filter import FilterType from .common import StaticConst GET_LABEL_NAMES = "/api/v1/series" RANGE_QUERY_API = "/api/v1/query_range" +QUERY_API = "/api/v1/query" METRIC_METADATA = "/api/v1/metadata" +GET_LABEL_VALUE = "/api/v1/label" +class QueryType(Enum): + instant = 0 + range = 1 class PrometheusMetricReader(MetricReader): def __init__(self, url: MetricReaderUrl, **kwargs): @@ -32,6 +38,49 @@ class PrometheusMetricReader(MetricReader): def _get_url(self, api: str): return urljoin(self.base_url, api) + def _get_basic_promql_query(self, task: QueryTask): + promql_str = task.metric_name + rules = [] + if task.filters is not None and len(task.filters) > 0: + for flt in task.filters: + if flt.filter_type == FilterType.Equal: + rules.append(f'{flt.label_name}="{flt.value}"') + elif flt.filter_type == FilterType.Wildcard: + rules.append(f'{flt.label_name}=~' + f'"{flt.value.replace("*", "(.*?)")}"') + promql_str = promql_str + "{" + ",".join(rules) + "}" + return promql_str + + def _parse_res(self, query_type, res) -> MetricResult: + mr = MetricResult(0, []) + if res.status_code != 200: + mr.code = 1 + mr.err_msg = "Request failed, status_code != 200" + else: + json_res = res.json() + if json_res["status"] != "success": + mr.code = 1 + mr.err_msg = f"Prometheus API error: {json_res['error']}" + else: + if query_type == QueryType.range: + mr.data = [ + RangeVectorResult(item["metric"]["__name__"], + item["metric"], + values=item["values"]) + for item in json_res["data"]["result"] + ] + elif query_type == QueryType.instant: + for item in json_res["data"]["result"]: + metric_name = "" + if "__name__" in item["metric"]: + metric_name = item["metric"]["__name__"] + mr.data.append( + InstantVectorResult(metric_name, + item["metric"], + value=item["value"]) + ) + return mr + def get_metric_names(self, limit: int = -1) -> MetricResult: params = {} if limit > 0: @@ -71,44 +120,75 @@ class PrometheusMetricReader(MetricReader): mr.data = list(series[0].keys()) return mr - def range_query(self, queries: List[RangeQueryTask]) -> MetricResult: - def get_promql_query(task: RangeQueryTask): - promql_str = task.metric_name - rules = [] - if task.filters is not None and len(task.filters) > 0: - for flt in task.filters: - if flt.filter_type == FilterType.Equal: - rules.append(f'{flt.label_name}="{flt.value}"') - elif flt.filter_type == FilterType.Wildcard: - rules.append(f'{flt.label_name}=~' - f'"{flt.value.replace("*", "(.*?)")}"') - promql_str = promql_str + "{" + ",".join(rules) + "}" - return promql_str + def get_label_values(self, label_name) -> MetricResult: + url = GET_LABEL_VALUE + "/" + label_name + "/values" + res = requests.get(self._get_url(url)) + mr = MetricResult(0, []) + if res.status_code != 200: + mr.code = 1 + mr.err_msg = f"Get values for {label_name} failed!" + else: + json_res = res.json() + if json_res["status"] != "success": + mr.code = 1 + mr.err_msg = (f"Get label values for" + f" {label_name} failed => {json_res['error']}") + else: + values = json_res["data"] + if len(values) > 0: + mr.data = values + return mr + + def instant_query(self, queries: List[InstantQueryTask]) -> MetricResult: + def query_one(task: InstantQueryTask): + basic_query = self._get_basic_promql_query(task) + """ + range vector aggregation function also need to use query api + example one: curl -g 'http://localhost:9090/api/v1/query? + query=avg_over_time(sysom_cgroups[5m])&time=1696679657.796' + + example two: curl -g 'http://localhost:9090/api/v1/query? + query=topk(3, sysom_cgroups[5m])&time=1696679657.796' + """ + if task.aggregation is not None: + if task.interval is not None: + interval = f"[{task.interval}]" + basic_query = basic_query + interval + if task.aggregation_val is not None: + basic_query = f"{task.aggregation_val}, " + basic_query + + basic_query = task.aggregation + "(" + basic_query + ")" + if task.clause_label is not None: + basic_query = basic_query + task.clause \ + + "(" + ",".join(task.clause_label) + ")" + + res = requests.get(self._get_url(QUERY_API), { + "query": basic_query, + "time": task.time, + }) + + return self._parse_res(QueryType.instant, res) + + merged_result = MetricResult(0, []) + for query in queries: + mr_result = query_one(query) + if mr_result.code != 0: + merged_result.code = mr_result.code + merged_result.err_msg = mr_result.err_msg + break + merged_result.data.extend(mr_result.data) + return merged_result + def range_query(self, queries: List[RangeQueryTask]) -> MetricResult: def query_one(task: RangeQueryTask): res = requests.get(self._get_url(RANGE_QUERY_API), { - "query": get_promql_query(task), + "query": self._get_basic_promql_query(task), "start": task.start_time, "end": task.end_time, "step": f"{task.step}s" }) - mr = MetricResult(0, []) - if res.status_code != 200: - mr.code = 1 - mr.err_msg = "Request failed, status_code != 200" - else: - json_res = res.json() - if json_res["status"] != "success": - mr.code = 1 - mr.err_msg = f"Prometheus API error: {json_res['error']}" - else: - mr.data = [ - RangeVectorResult(item["metric"]["__name__"], - item["metric"], - values=item["values"]) - for item in json_res["data"]["result"] - ] - return mr + + return self._parse_res(QueryType.range, res) merged_result = MetricResult(0, []) for query in queries: diff --git a/environment/1_sdk/metric_reader/result.py b/environment/1_sdk/metric_reader/result.py index ed3a8c1b3d63e11f2e56bfdcc3a9d2139996835d..3fedebca52edb940298cf751f5be919766bd9c12 100644 --- a/environment/1_sdk/metric_reader/result.py +++ b/environment/1_sdk/metric_reader/result.py @@ -25,10 +25,26 @@ class RangeVectorResult: def to_dict(self): return dict(self) +class InstantVectorResult: + def __init__(self, metric_name: str, labels: Dict[str, str], + value: Tuple[float, float]): + self.metric_name = metric_name + self.labels = labels + self.value = value + + def keys(self): + return ('metric_name', 'labels', 'value') + + def __getitem__(self, item): + return getattr(self, item) + + def to_dict(self): + return dict(self) + class MetricResult: def __init__(self, code: int, - data: Union[List[RangeVectorResult], List[str]], + data: Union[List[RangeVectorResult], List[InstantVectorResult], List[str]], err_msg: str = ""): self.code = code self.err_msg = err_msg @@ -43,6 +59,8 @@ class MetricResult: if isinstance(obj, List) and len(obj) > 0: if isinstance(obj[0], RangeVectorResult): return [item.to_dict() for item in obj] + if isinstance(obj[0], InstantVectorResult): + return [item.to_dict() for item in obj] else: return obj else: diff --git a/environment/1_sdk/metric_reader/task.py b/environment/1_sdk/metric_reader/task.py index d178ad231bb0043ab1b9b9e7386c0febbe8c384d..4df06602c53efbbe2b0bce19cfc3100983242aec 100644 --- a/environment/1_sdk/metric_reader/task.py +++ b/environment/1_sdk/metric_reader/task.py @@ -44,7 +44,6 @@ class QueryTask: self.filters.append(EqualFilter(label_name, value)) return self - class RangeQueryTask(QueryTask): """ Args: @@ -60,3 +59,29 @@ class RangeQueryTask(QueryTask): self.start_time = start_time self.end_time = end_time self.step = step + +class InstantQueryTask(QueryTask): + """ + Args: + time(float): Instant time (Unix timestamp) + aggregation(str):Built-in aggregation function of promql + (e.g. avg_over_time, rate, sum) + interval(str): Range of time use to get a range vector(e.g. 5m) + """ + def __init__( + self, + metric_name: str, + time: float, + aggregation: str = None, + interval: str = None, + aggregation_val: str = None, + filters: List[Filter] = None, + clause_label: List[str] = None, + ): + super().__init__(metric_name, filters) + self.time = time + self.aggregation = aggregation + self.interval = interval + self.clause = "by" + self.clause_label = clause_label + self.aggregation_val = aggregation_val diff --git a/environment/1_sdk/redis_lua/x_hdrop_list.lua b/environment/1_sdk/redis_lua/x_hdrop_list.lua new file mode 100644 index 0000000000000000000000000000000000000000..9ad4b9cfc7aeb7761a1d256f91fe7413cab630de --- /dev/null +++ b/environment/1_sdk/redis_lua/x_hdrop_list.lua @@ -0,0 +1,9 @@ +local cursor = 0 +repeat + local result = redis.call('SCAN', cursor, 'MATCH', KEYS[1].."*", 'COUNT', 1000) + cursor = tonumber(result[1]) + local keys = result[2] + for i = 1, #keys do + redis.call('DEL', keys[i]) + end +until cursor == 0 \ No newline at end of file diff --git a/environment/1_sdk/redis_lua/xreadis_hash_table.py b/environment/1_sdk/redis_lua/xreadis_hash_table.py index 92092368adefd544654188691cc7f5ec42a7fc70..a9a31eff0d955bbf5cea89d94cafd451c21f1cfb 100644 --- a/environment/1_sdk/redis_lua/xreadis_hash_table.py +++ b/environment/1_sdk/redis_lua/xreadis_hash_table.py @@ -90,6 +90,10 @@ class XRedisHashTable: res = self._evalsha("x_hdrop_table", 2, table_name, self._get_expire_table(table_name)) return res == "OK" + + def hdrop_list(self, table_name: str) -> bool: + res = self._evalsha("x_hdrop_list", 1, table_name) + return res == "OK" def hdel(self, table_name: str, *fields: str) -> bool: res = self._evalsha("x_hdel", 2, table_name, diff --git a/environment/1_sdk/setup_sysom_utils.py b/environment/1_sdk/setup_sysom_utils.py index 902b880b20a09aa27d8582ecd0b91f97d90a6262..bd289b810a01ce8652b8aedc262394381c76c249 100644 --- a/environment/1_sdk/setup_sysom_utils.py +++ b/environment/1_sdk/setup_sysom_utils.py @@ -24,7 +24,8 @@ setuptools.setup( "aiofiles==0.8.0", "anyio>=3.6.2", "asyncer==0.0.2", - "fastapi==0.83.0" + "fastapi==0.83.0", + "psutil==5.9.7" ], classifiers=[ "Programming Language :: Python :: 3", diff --git a/environment/1_sdk/sysom_utils/adddict.py b/environment/1_sdk/sysom_utils/adddict.py index 1fedea331a3d684e6566594f0c8fc5154a9af09a..d82c09f800fb7fd7a415ce9ded49f6d7db9d62ef 100644 --- a/environment/1_sdk/sysom_utils/adddict.py +++ b/environment/1_sdk/sysom_utils/adddict.py @@ -7,6 +7,7 @@ File __init__.py.py Description: """ import copy +from typing import Optional class Dict(dict): @@ -95,11 +96,14 @@ class Dict(dict): base[key] = value return base - def get_multi(self, key_str: str): - keys = key_str.split(".") - res = self - for key in keys: - res = res.get(key) + def get_multi(self, key_str: str) -> Optional[any]: + try: + keys = key_str.split(".") + res = self + for key in keys: + res = res.get(key) + except Exception as _: + return None return res def set_multi(self, key_str: str, value: any): diff --git a/environment/1_sdk/sysom_utils/config_parser.py b/environment/1_sdk/sysom_utils/config_parser.py index 6984593775f3da10a19354ba5e7961a1bffb15ed..9decb6670470d8174d84c911f68a942f7b08097c 100644 --- a/environment/1_sdk/sysom_utils/config_parser.py +++ b/environment/1_sdk/sysom_utils/config_parser.py @@ -9,12 +9,13 @@ Description: import yaml from enum import Enum import os +from clogger import logger from .yaml_concat import YamlConcatConstructor from .adddict import Dict def dict_merge(dct: dict, merge_dct: dict): - """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of + """Recursive dict merge. Inspired by :meth:``dict.update()``, instead of updating only top-level keys, dict_merge recurses down into dicts nested to an arbitrary depth, updating keys. The ``merge_dct`` is merged into ``dct``. @@ -28,12 +29,35 @@ def dict_merge(dct: dict, merge_dct: dict): if dct is None or merge_dct is None: return for k, v in merge_dct.items(): - if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): # noqa + if ( + k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict) + ): # noqa dict_merge(dct[k], merge_dct[k]) else: dct[k] = merge_dct[k] +def get_all_sysom_env_from_proc_1() -> dict: + env = {} + try: + with open("/proc/1/environ", "r") as f: + for line in f.readlines(): + line = line.strip() + if line == "": + continue + envs = line.split("\0") + for _env in envs: + _env = _env.strip() + if _env == "": + continue + k, v = _env.split("=", 1) + if k.startswith("sysom"): + env[k] = v + except Exception as e: + logger.warning(e) + return env + + SYSOM_CONFIG_SECTION_GLOBAL = "sysom_global" SYSOM_CONFIG_SECTION_SERVER = "sysom_server" SYSOM_CONFIG_SECTION_WEB = "sysom_web" @@ -49,9 +73,10 @@ ENV_LIST = { "REDIS_HOST": "sysom_server.db.redis.host", "REDIS_PORT": "sysom_server.db.redis.port", "REDIS_USERNAME": "sysom_server.db.redis.username", - "REDIS_PASSWORD": "sysom_server.db.redis.password" + "REDIS_PASSWORD": "sysom_server.db.redis.password", } + class ConfigParserException(Exception): pass @@ -68,15 +93,20 @@ class ConfigParser: self.service_config_path = service_config_path self._config: Dict = self._load_config() self._overwrite_from_env() - + def _overwrite_from_env(self) -> None: + proc_1_envs = get_all_sysom_env_from_proc_1() + for k, v in proc_1_envs.items(): + key = k.replace("___", ".") + self._config.set_multi(key, v) for env in os.environ: if env.startswith("sysom"): - self._config.set_multi(env, os.environ[env]) + key = env.replace("___", ".") + self._config.set_multi(key, os.environ[env]) for env, key_str in ENV_LIST.items(): if os.getenv(env): self._config.set_multi(key_str, os.getenv(env)) - + def _load_config(self) -> Dict: YamlConcatConstructor.add_to_loader_class(loader_class=yaml.FullLoader) global_config: dict = {} @@ -136,9 +166,25 @@ class ConfigParser: params.append(f"username={redis_config.username}") if redis_config.password: params.append(f"password={redis_config.password}") + elif cec_config.protocol == "kafka": + kafka_config = server_config.db.kafka + cec_url = ( + f"{cec_config.protocol}://{kafka_config.host}:{kafka_config.port}?" + ) + if kafka_config.mechanisms: + params.append(f"sasl.mechanisms={kafka_config.mechanisms}") + if kafka_config.ca_location: + params.append(f"ssl.ca.location={kafka_config.ca_location}") + if kafka_config.securityprotocol: + params.append(f"security.protocol={kafka_config.securityprotocol}") + if kafka_config.username: + params.append(f"sasl.username={kafka_config.username}") + if kafka_config.password: + params.append(f"sasl.password={kafka_config.password}") else: raise ConfigParserException( - f"Not support cec protocol: {cec_config.protocol}") + f"Not support cec protocol: {cec_config.protocol}" + ) for k in special_param: params.append(f"{k}={special_param[k]}") cec_url += "&".join(params) @@ -165,7 +211,7 @@ class ConfigParser: params.append(f"{k}={special_param[k]}") cmg_url += "&".join(params) return cmg_url - + def get_gcache_url(self) -> str: service_config = self.get_service_config() server_config = self.get_server_config() @@ -188,7 +234,7 @@ class ConfigParser: params.append(f"{k}={special_param[k]}") gcache_url += "&".join(params) return gcache_url - + def get_gclient_url(self, service_name: str) -> str: service_config = self.get_service_config() gclient_config = service_config.framework.gclient @@ -216,7 +262,7 @@ class ConfigParser: params = [ f"channel_job_target_topic={channel_job_config.target_topic}", f"channel_job_listen_topic={channel_job_config.listen_topic}", - f"channel_job_consumer_group={channel_job_config.consumer_group}" + f"channel_job_consumer_group={channel_job_config.consumer_group}", ] if cec_url != "" and cec_url[-1] != "?": cec_url += "&" diff --git a/environment/1_sdk/sysom_utils/fastapi_helper.py b/environment/1_sdk/sysom_utils/fastapi_helper.py index c406273d9e0fd73cd08ab999e74b50c5810b6a3b..7142a3377e2c2dedbf6b0cc02d1bdbc9b66e6fc6 100644 --- a/environment/1_sdk/sysom_utils/fastapi_helper.py +++ b/environment/1_sdk/sysom_utils/fastapi_helper.py @@ -5,17 +5,39 @@ Email mfeng@linux.alibaba.com File schemas.py Description: """ + +import jwt from functools import lru_cache from abc import abstractmethod -from typing import Any, Optional, Generic, Type, TypeVar, List, Union +from typing import ( + Any, + Optional, + Generic, + Tuple, + Type, + TypeVar, + List, + Union, + Dict, + Callable, +) from clogger import logger +from fastapi.requests import HTTPConnection from pydantic import BaseModel, create_model from starlette.requests import Request +from starlette.responses import JSONResponse from sqlalchemy import desc, asc, and_ from sqlalchemy.orm import Session, Query from sqlalchemy.sql import func from sqlalchemy.orm.attributes import InstrumentedAttribute from sqlalchemy.sql.sqltypes import Integer, String, Enum, SmallInteger +from starlette.authentication import ( + AuthenticationBackend, + AuthCredentials, + AuthenticationError, + BaseUser, +) +from starlette.middleware.authentication import AuthenticationMiddleware ####################################################################################### @@ -82,9 +104,15 @@ class QueryBuilder: class BaseQueryParams(BaseModel): - current: int = 1 - pageSize: int = 10 - sort__: str = "-created_at" + current: int = 1 # 当前页码 + pageSize: int = 10 # 每页数量 + sort__: str = "-created_at" # 排序字段 + like_fields: List[str] = [] # 模糊查询字段 + # https://stackoverflow.com/questions/53264047/sqlalchemy-filter-by-json-field + json_fields: List[str] = [] # JSON字段 + json_map_fields: Dict[str, str] = ( + {} + ) # JSON字段映射, eg.: {"channel": "params.channel"} __modelclass__: Optional[object] = None @@ -104,6 +132,9 @@ class BaseQueryParams(BaseModel): filter_params.pop("current", "") filter_params.pop("pageSize", "") filter_params.pop("sort__", "") + filter_params.pop("like_fields", []) + filter_params.pop("json_fields", []) + filter_params.pop("json_map_fields", {}) filters = [] for k, v in filter_params.items(): @@ -116,7 +147,11 @@ class BaseQueryParams(BaseModel): if not v or len(values) <= 0: continue - field_type = type(self.get_model_class().__dict__[k].type) + field_type = String + if k in self.json_map_fields: + field_type = String + else: + field_type = type(self.get_model_class().__dict__[k].type) if field_type in [Integer, SmallInteger]: values = [int(v_) for v_ in values] elif field_type == Enum: @@ -131,11 +166,43 @@ class BaseQueryParams(BaseModel): pass if len(values) > 1: - # 过滤多个,eg.: alert_item=test5,test6 - filters.append(self.get_model_class().__dict__[k].in_(values)) + if k in self.json_map_fields: + map_value = self.json_map_fields[k].split(".") + if len(map_value) != 2: + raise Exception(f"json_map_fields {k} value error") + origin_field, sub_field = map_value[0], map_value[1] + filters.append( + self.get_model_class() + .__dict__[origin_field][sub_field] + .in_(values) + ) + if k in self.json_fields: + # JSON字段过滤,eg.: instance=192.168.0.1,level=warning + for value in values: + j_k, j_v = value.split("=") + filters.append(self.get_model_class().__dict__[k][j_k] == j_v) + else: + # 过滤多个,eg.: alert_item=test5,test6 + filters.append(self.get_model_class().__dict__[k].in_(values)) else: - # 过滤单个,eg.: status=RESOLVED - filters.append(self.get_model_class().__dict__[k] == values[0]) + if k in self.json_map_fields: + # JSON字段映射过滤,eg.: {"channel": "params.channel"} + map_value = self.json_map_fields[k].split(".") + if len(map_value) != 2: + raise Exception(f"json_map_fields {k} value error") + origin_field, sub_field = map_value[0], map_value[1] + filters.append( + self.get_model_class().__dict__[origin_field][sub_field] + == values[0] + ) + elif k in self.json_fields: + # JSON字段过滤,eg.: instance=192.168.0.1 + j_k, j_v = values[0].split("=") + filters.append(self.get_model_class().__dict__[k][j_k] == j_v) + elif k in self.like_fields and "%" in values[0]: + filters.append(self.get_model_class().__dict__[k].like(values[0])) + else: + filters.append(self.get_model_class().__dict__[k] == values[0]) if len(filters) > 0: return and_(*filters) @@ -313,7 +380,6 @@ class StandardResponse(Generic[M, S]): response_type = get_standard_response_model(schema_class) response_data = schema_class.from_orm(data) else: - print(data, type(data)) raise Exception("data must be orm model object or pydantic object") return response_type(code=code, message=message, data=response_data) @@ -391,3 +457,61 @@ class FastApiResponseHelper: def bind_base_class(base_class: object): StandardResponse.bind_base_class(base_class) StandardListResponse.bind_base_class(base_class) + + +############################################################################################################# +# Auth +############################################################################################################# + + +class FastAPIUser(BaseUser): + def __init__(self, user_id: str) -> None: + self.user_id = user_id + + +class FastAPIJWTAuthBackend(AuthenticationBackend): + def __init__( + self, + jwt_secret_key: str, + jwt_algo: str = "HS256", + jwt_expire: int = 3600, + excluded_urls: List[str] = None, + ) -> None: + self.jwt_secret_key = jwt_secret_key + self.jwt_algo = jwt_algo + self.jwt_expire = jwt_expire + self.excluded_urls = excluded_urls or [] + + async def decode_token(self, token: str): + return jwt.decode(token, self.jwt_secret_key, algorithms=[self.jwt_algo]) + + async def authenticate( + self, conn: HTTPConnection + ) -> Tuple[AuthCredentials, BaseUser]: + if conn.url.path in self.excluded_urls: + return AuthCredentials(scopes=[]), FastAPIUser(user_id="anonymous") + token = conn.headers.get("Authorization", None) + if token is None: + raise AuthenticationError("Not authenticated") + payload = await self.decode_token(token) + scopes, user_id = payload.get("scopes", []), payload.get("id") + return AuthCredentials(scopes=scopes), FastAPIUser(user_id=user_id) + + +class JWTAuthMiddleware(AuthenticationMiddleware): + def __init__( + self, + app, + jwt_secret_key: str, + jwt_algo: str = "HS256", + jwt_expire: int = 3600, + excluded_urls: List[str] = None, + auth_error_handler: Callable[ + [Request, AuthenticationError], JSONResponse + ] = None, + ) -> None: + super().__init__( + app, + FastAPIJWTAuthBackend(jwt_secret_key, jwt_algo, jwt_expire, excluded_urls), + on_error=auth_error_handler, + ) diff --git a/environment/1_sdk/sysom_utils/framework.py b/environment/1_sdk/sysom_utils/framework.py index b67370247f12f61453b5aef172bd8735823fd4a0..d023f0aa01c97848779fb279cfac2241ea97f743 100644 --- a/environment/1_sdk/sysom_utils/framework.py +++ b/environment/1_sdk/sysom_utils/framework.py @@ -36,15 +36,39 @@ class SysomFramework: _config: Optional[ConfigParser] = None _gcache_map: Dict[str, GCache] = {} _framework_plug_mag: Optional[FrameworkPlugMag] = None - _alarm_producer: Optional[Producer] = None + _cec_producer: Optional[Producer] = None @classmethod def init(cls, config: ConfigParser): cls._config = config cls._framework_plug_mag = FrameworkPlugMag(config) cls.init_logger(config) + cls.catch_kill_sig_then_kill_all_child() return cls + @classmethod + def catch_kill_sig_then_kill_all_child(cls): + """Catch kill signal and kill all child process + """ + def signal_handler(signum, frame): + parent = psutil.Process(pid) + for child in parent.children(recursive=True): # 获取所有子进程 + child.kill() + child.wait() + # 主动退出当前进程 + os._exit(1) + import signal + import os + import psutil + + pid = os.getpid() + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGHUP, signal_handler) + signal.signal(signal.SIGQUIT, signal_handler) + signal.signal(signal.SIGABRT, signal_handler) + @classmethod def init_logger(cls, config: ConfigParser): """Init clogger @@ -178,16 +202,16 @@ class SysomFramework: ) default_channel_job_executor.start() return cls + + @classmethod + def _get_cec_producer(cls): + if cls._cec_producer is None: + cls._cec_producer = cls.cec_producer() + return cls._cec_producer ################################################################################ # Alarm ################################################################################ - @classmethod - def _get_alarm_producer(cls): - if cls._alarm_producer is None: - cls._alarm_producer = cls.cec_producer() - return cls._alarm_producer - @classmethod def alarm(cls, alert_data): """Dispatch one SAD alert data to event center @@ -195,7 +219,7 @@ class SysomFramework: Args: alert_data (_type_): _description_ """ - cls._get_alarm_producer().produce("SYSOM_SAD_ALERT", alert_data) + cls._get_cec_producer().produce("SYSOM_SAD_ALERT", alert_data) @classmethod def alarm_application( @@ -245,9 +269,21 @@ class SysomFramework: """ if action not in ["ADD_ANNOTATION", "ADD_OPT", "MERGE"]: raise Exception(f"Not support alarm action: {action}") - cls._get_alarm_producer().produce( + cls._get_cec_producer().produce( "SYSOM_ALARM_ACTION", {"action": action, "data": action_data} ) + + ################################################################################ + # CLUSTER HEALTH + ################################################################################ + @classmethod + def abnormal_metric(cls, metric_data): + """Dispatch one abnormal data to event center + + Args: + alert_data (_type_): _description_ + """ + cls._get_cec_producer().produce("SYSOM_HEALTH_METRIC", metric_data) @classmethod def start(cls): diff --git a/package_rpm_online.sh b/package_rpm_online.sh index 4722e5f8bd362037dadbe168c2e569d6c2f1dee9..7f663dcb7022f519a2143674aa37535169856f4b 100755 --- a/package_rpm_online.sh +++ b/package_rpm_online.sh @@ -1,5 +1,5 @@ #!/bin/bash -version=3.0 +version=3.1 check_cmd() { local cmd="$1" diff --git a/script/server/sysom_ad_proxy/sysom-ad_proxy.ini b/script/server/sysom_ad_proxy/sysom-ad_proxy.ini index 49ab06cef535e8e3f7b206dfbd8e34d6c996702a..c2dc73bd4a59b66f6fb49dd5f8a70941340ec626 100644 --- a/script/server/sysom_ad_proxy/sysom-ad_proxy.ini +++ b/script/server/sysom_ad_proxy/sysom-ad_proxy.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-ad_proxy-error.log stdout_logfile=/var/log/sysom/sysom-ad_proxy.log diff --git a/script/server/sysom_alarm/sysom-alarm.ini b/script/server/sysom_alarm/sysom-alarm.ini index 01254501a63732f23b9c5f09627ea48ef312b77e..63814f08915dfd5a34934f20a1e3303ecbcc1db3 100644 --- a/script/server/sysom_alarm/sysom-alarm.ini +++ b/script/server/sysom_alarm/sysom-alarm.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-alarm-error.log stdout_logfile=/var/log/sysom/sysom-alarm.log diff --git a/script/server/sysom_alert_pusher/clear.sh b/script/server/sysom_alert_pusher/clear.sh new file mode 100644 index 0000000000000000000000000000000000000000..ad96fb9c704794eb8ec41c6d6054a20c95429a22 --- /dev/null +++ b/script/server/sysom_alert_pusher/clear.sh @@ -0,0 +1,14 @@ +#!/bin/bash +BaseDir=$(dirname $(readlink -f "$0")) +SERVICE_NAME=sysom-alert_pusher + +clear_app() { + rm -rf /etc/supervisord.d/${SERVICE_NAME}.ini + ###use supervisorctl update to stop and clear services### + supervisorctl update +} + +# Stop first +bash -x $BaseDir/stop.sh + +clear_app diff --git a/script/server/sysom_alert_pusher/db_migrate.sh b/script/server/sysom_alert_pusher/db_migrate.sh new file mode 100644 index 0000000000000000000000000000000000000000..f530e1f82f0e3becd05b158b68263bc37fbd0447 --- /dev/null +++ b/script/server/sysom_alert_pusher/db_migrate.sh @@ -0,0 +1,18 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +VIRTUALENV_HOME=$GLOBAL_VIRTUALENV_HOME + +source_virtualenv() { + echo "INFO: activate virtualenv..." + source ${VIRTUALENV_HOME}/bin/activate || exit 1 +} + +db_migrate() { + pushd ${SERVICE_HOME} + alembic upgrade head + popd +} + +source_virtualenv +db_migrate \ No newline at end of file diff --git a/script/server/sysom_alert_pusher/init.sh b/script/server/sysom_alert_pusher/init.sh new file mode 100644 index 0000000000000000000000000000000000000000..2cb5fb7a6e6fa1e6c0c9dcad2a134d1fc0ddfc75 --- /dev/null +++ b/script/server/sysom_alert_pusher/init.sh @@ -0,0 +1,25 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +BaseDir=$(dirname $(readlink -f "$0")) +SERVICE_NAME=sysom-alert_pusher + +init_conf() { + cp ${SERVICE_NAME}.ini /etc/supervisord.d/ + ###change the install dir base on param $1### + sed -i "s;/usr/local/sysom;${APP_HOME};g" /etc/supervisord.d/${SERVICE_NAME}.ini + cpu_num=$(cat /proc/cpuinfo | grep processor | wc -l) + sed -i "s/threads = 3/threads = $cpu_num/g" ${SERVICE_HOME}/conf/gunicorn.py +} + +init_app() { + init_conf + bash -x $BaseDir/db_migrate.sh + ###if supervisor service started, we need use "supervisorctl update" to start new conf#### + supervisorctl update +} + +init_app + +# Start +bash -x $BaseDir/start.sh diff --git a/script/server/sysom_alert_pusher/install.sh b/script/server/sysom_alert_pusher/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..f64f8fb85ed2371d4c108e5ab9ea629fdd2aa7e2 --- /dev/null +++ b/script/server/sysom_alert_pusher/install.sh @@ -0,0 +1,29 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +SERVICE_SCRIPT_HOME=${MICROSERVICE_SCRIPT_HOME}/${SERVICE_SCRIPT_DIR} +VIRTUALENV_HOME=$GLOBAL_VIRTUALENV_HOME +SERVICE_NAME=sysom-alert_pusher + +if [ "$UID" -ne 0 ]; then + echo "Please run as root" + exit 1 +fi + +install_requirement() { + pushd ${SERVICE_SCRIPT_HOME} + pip install -r requirements.txt + popd +} + +source_virtualenv() { + echo "INFO: activate virtualenv..." + source ${VIRTUALENV_HOME}/bin/activate || exit 1 +} + +install_app() { + source_virtualenv + install_requirement +} + +install_app diff --git a/script/server/sysom_alert_pusher/requirements.txt b/script/server/sysom_alert_pusher/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c2a17dfe74f495037656202e288b21bf01cc996 --- /dev/null +++ b/script/server/sysom_alert_pusher/requirements.txt @@ -0,0 +1,19 @@ +clogger==0.0.1 +channel_job>=0.0.1 +cec_base>=0.0.1 +cec_redis>=0.0.1 +sysom_utils>=0.0.1 +alembic==1.7.7 +anyio==3.6.2 +asyncer==0.0.2 +asyncssh==2.12.0 +fastapi==0.83.0 +PyMySQL==1.0.2 +pyyaml==6.0 +pyyaml-include==1.3 +uvicorn==0.16.0 +gunicorn==20.1.0 +python-multipart==0.0.5 +###################################################################### +# Add your custom python requirements here +###################################################################### \ No newline at end of file diff --git a/script/server/sysom_alert_pusher/start.sh b/script/server/sysom_alert_pusher/start.sh new file mode 100644 index 0000000000000000000000000000000000000000..9346d5c444e6c8b4b5f472b87f79e3e518d35de2 --- /dev/null +++ b/script/server/sysom_alert_pusher/start.sh @@ -0,0 +1,29 @@ +#!/bin/bash +SERVICE_NAME=sysom-alert_pusher + +is_start() { + status=`supervisorctl status ${SERVICE_NAME} | awk '{print $2}'` + result=`echo "RUNNING STARTING" | grep $status` + if [[ "$result" != "" ]] + then + return 1 + else + return 0 + fi +} + +start_app() { + is_start + if [[ $? == 0 ]]; then + supervisorctl start $SERVICE_NAME + is_start + if [[ $? == 0 ]]; then + echo "${SERVICE_NAME} service start fail, please check log" + exit 1 + else + echo "supervisorctl start ${SERVICE_NAME} success..." + fi + fi +} + +start_app diff --git a/script/server/sysom_alert_pusher/stop.sh b/script/server/sysom_alert_pusher/stop.sh new file mode 100644 index 0000000000000000000000000000000000000000..bc7a7f708545bf16d5323b8f4f4df3778cbc3035 --- /dev/null +++ b/script/server/sysom_alert_pusher/stop.sh @@ -0,0 +1,8 @@ +#!/bin/bash +SERVICE_NAME=sysom-alert_pusher + +stop_app() { + supervisorctl stop $SERVICE_NAME +} + +stop_app diff --git a/script/server/sysom_alert_pusher/sysom-alert_pusher.ini b/script/server/sysom_alert_pusher/sysom-alert_pusher.ini new file mode 100644 index 0000000000000000000000000000000000000000..4d07c014db7ce705795c363abdc352f15100650a --- /dev/null +++ b/script/server/sysom_alert_pusher/sysom-alert_pusher.ini @@ -0,0 +1,9 @@ +[program:sysom-alert_pusher] +directory=/usr/local/sysom/server/sysom_alert_pusher +command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn.py main:app +startsecs=3 +autostart=true +autorestart=true +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s +stderr_logfile=/var/log/sysom/sysom-alert_pusher-error.log +stdout_logfile=/var/log/sysom/sysom-alert_pusher.log diff --git a/script/server/sysom_alert_pusher/uninstall.sh b/script/server/sysom_alert_pusher/uninstall.sh new file mode 100644 index 0000000000000000000000000000000000000000..0aac04c09286c7f24e98d8a20ce131cdbec36375 --- /dev/null +++ b/script/server/sysom_alert_pusher/uninstall.sh @@ -0,0 +1,9 @@ +#!/bin/bash +BaseDir=$(dirname $(readlink -f "$0")) + +uninstall_app() { + # do nothing + echo "" +} + +uninstall_app \ No newline at end of file diff --git a/script/server/sysom_cec_proxy/sysom-cec_proxy.ini b/script/server/sysom_cec_proxy/sysom-cec_proxy.ini index 265bb4195d623ba0236a2d8c057d3315634231c3..6fe383efad3a7118d1703913de9378f0014d3d5d 100644 --- a/script/server/sysom_cec_proxy/sysom-cec_proxy.ini +++ b/script/server/sysom_cec_proxy/sysom-cec_proxy.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-cec_proxy-error.log stdout_logfile=/var/log/sysom/sysom-cec_proxy.log diff --git a/script/server/sysom_channel/sysom-channel.ini b/script/server/sysom_channel/sysom-channel.ini index 25d774615910fdb6abad6ca6ca5f54148de77c67..fb78a6b7df545788ef41e7f1280ef353a508f887 100644 --- a/script/server/sysom_channel/sysom-channel.ini +++ b/script/server/sysom_channel/sysom-channel.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-channel-error.log stdout_logfile=/var/log/sysom/sysom-channel.log diff --git a/script/server/sysom_cluster_health/clear.sh b/script/server/sysom_cluster_health/clear.sh new file mode 100644 index 0000000000000000000000000000000000000000..1a564ac41ab326af51c23ea94b5920b3a5c56073 --- /dev/null +++ b/script/server/sysom_cluster_health/clear.sh @@ -0,0 +1,14 @@ +#!/bin/bash +BaseDir=$(dirname $(readlink -f "$0")) +SERVICE_NAME=sysom-cluster_health + +clear_app() { + rm -rf /etc/supervisord.d/${SERVICE_NAME}.ini + ###use supervisorctl update to stop and clear services### + supervisorctl update +} + +# Stop first +bash -x $BaseDir/stop.sh + +clear_app diff --git a/script/server/sysom_cluster_health/db_migrate.sh b/script/server/sysom_cluster_health/db_migrate.sh new file mode 100644 index 0000000000000000000000000000000000000000..f530e1f82f0e3becd05b158b68263bc37fbd0447 --- /dev/null +++ b/script/server/sysom_cluster_health/db_migrate.sh @@ -0,0 +1,18 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +VIRTUALENV_HOME=$GLOBAL_VIRTUALENV_HOME + +source_virtualenv() { + echo "INFO: activate virtualenv..." + source ${VIRTUALENV_HOME}/bin/activate || exit 1 +} + +db_migrate() { + pushd ${SERVICE_HOME} + alembic upgrade head + popd +} + +source_virtualenv +db_migrate \ No newline at end of file diff --git a/script/server/sysom_cluster_health/init.sh b/script/server/sysom_cluster_health/init.sh new file mode 100644 index 0000000000000000000000000000000000000000..5446f8489db614bb8fa56897330a801f9d66dab6 --- /dev/null +++ b/script/server/sysom_cluster_health/init.sh @@ -0,0 +1,25 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +BaseDir=$(dirname $(readlink -f "$0")) +SERVICE_NAME=sysom-cluster_health + +init_conf() { + cp ${SERVICE_NAME}.ini /etc/supervisord.d/ + ###change the install dir base on param $1### + sed -i "s;/usr/local/sysom;${APP_HOME};g" /etc/supervisord.d/${SERVICE_NAME}.ini + cpu_num=$(cat /proc/cpuinfo | grep processor | wc -l) + sed -i "s/threads = 3/threads = $cpu_num/g" ${SERVICE_HOME}/conf/gunicorn.py +} + +init_app() { + init_conf + bash -x $BaseDir/db_migrate.sh + ###if supervisor service started, we need use "supervisorctl update" to start new conf#### + supervisorctl update +} + +init_app + +# Start +bash -x $BaseDir/start.sh diff --git a/script/server/sysom_cluster_health/install.sh b/script/server/sysom_cluster_health/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..ac8e2c906cbd5f965221dce57bc4f8e3a16d81e1 --- /dev/null +++ b/script/server/sysom_cluster_health/install.sh @@ -0,0 +1,29 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +SERVICE_SCRIPT_HOME=${MICROSERVICE_SCRIPT_HOME}/${SERVICE_SCRIPT_DIR} +VIRTUALENV_HOME=$GLOBAL_VIRTUALENV_HOME +SERVICE_NAME=sysom-cluster_health + +if [ "$UID" -ne 0 ]; then + echo "Please run as root" + exit 1 +fi + +install_requirement() { + pushd ${SERVICE_SCRIPT_HOME} + pip install -r requirements.txt + popd +} + +source_virtualenv() { + echo "INFO: activate virtualenv..." + source ${VIRTUALENV_HOME}/bin/activate || exit 1 +} + +install_app() { + source_virtualenv + install_requirement +} + +install_app diff --git a/script/server/sysom_cluster_health/requirements.txt b/script/server/sysom_cluster_health/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..75e0abe0ff741a79fdf8665785fc74999582a1ce --- /dev/null +++ b/script/server/sysom_cluster_health/requirements.txt @@ -0,0 +1,24 @@ +clogger==0.0.1 +channel_job>=0.0.1 +cec_base>=0.0.1 +cec_redis>=0.0.1 +sysom_utils>=0.0.1 +alembic==1.7.7 +anyio==3.6.2 +asyncer==0.0.2 +asyncssh==2.12.0 +fastapi==0.83.0 +PyMySQL==1.0.2 +pyyaml==6.0 +pyyaml-include==1.3 +uvicorn==0.16.0 +gunicorn==20.1.0 +python-multipart==0.0.5 +pandas>=1.1.5 +prometheus_client==0.16.0 +requests==2.27.1 +schedule==1.1.0 +scipy==1.5.4 +###################################################################### +# Add your custom python requirements here +###################################################################### \ No newline at end of file diff --git a/script/server/sysom_cluster_health/start.sh b/script/server/sysom_cluster_health/start.sh new file mode 100644 index 0000000000000000000000000000000000000000..1afc1a10c904b94892020b1008e213cc77661066 --- /dev/null +++ b/script/server/sysom_cluster_health/start.sh @@ -0,0 +1,29 @@ +#!/bin/bash +SERVICE_NAME=sysom-cluster_health + +is_start() { + status=`supervisorctl status ${SERVICE_NAME} | awk '{print $2}'` + result=`echo "RUNNING STARTING" | grep $status` + if [[ "$result" != "" ]] + then + return 1 + else + return 0 + fi +} + +start_app() { + is_start + if [[ $? == 0 ]]; then + supervisorctl start $SERVICE_NAME + is_start + if [[ $? == 0 ]]; then + echo "${SERVICE_NAME} service start fail, please check log" + exit 1 + else + echo "supervisorctl start ${SERVICE_NAME} success..." + fi + fi +} + +start_app diff --git a/script/server/sysom_cluster_health/stop.sh b/script/server/sysom_cluster_health/stop.sh new file mode 100644 index 0000000000000000000000000000000000000000..ed3eeb94bc8d288df7ce10efd94cfa6a07bc8e36 --- /dev/null +++ b/script/server/sysom_cluster_health/stop.sh @@ -0,0 +1,8 @@ +#!/bin/bash +SERVICE_NAME=sysom-cluster_health + +stop_app() { + supervisorctl stop $SERVICE_NAME +} + +stop_app diff --git a/script/server/sysom_cluster_health/sysom-cluster_health.ini b/script/server/sysom_cluster_health/sysom-cluster_health.ini new file mode 100644 index 0000000000000000000000000000000000000000..492815bfcea10363c0750032bf667d7f432fc2b6 --- /dev/null +++ b/script/server/sysom_cluster_health/sysom-cluster_health.ini @@ -0,0 +1,9 @@ +[program:sysom-cluster_health] +directory=/usr/local/sysom/server/sysom_cluster_health +command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn.py main:app +startsecs=3 +autostart=true +autorestart=true +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s +stderr_logfile=/var/log/sysom/sysom-cluster_health-error.log +stdout_logfile=/var/log/sysom/sysom-cluster_health.log diff --git a/script/server/sysom_cluster_health/uninstall.sh b/script/server/sysom_cluster_health/uninstall.sh new file mode 100644 index 0000000000000000000000000000000000000000..0aac04c09286c7f24e98d8a20ce131cdbec36375 --- /dev/null +++ b/script/server/sysom_cluster_health/uninstall.sh @@ -0,0 +1,9 @@ +#!/bin/bash +BaseDir=$(dirname $(readlink -f "$0")) + +uninstall_app() { + # do nothing + echo "" +} + +uninstall_app \ No newline at end of file diff --git a/script/server/sysom_cmg/clear.sh b/script/server/sysom_cmg/clear.sh new file mode 100644 index 0000000000000000000000000000000000000000..d1d29a5abc8420b660aac4da29a7b762ef89a2d1 --- /dev/null +++ b/script/server/sysom_cmg/clear.sh @@ -0,0 +1,14 @@ +#!/bin/bash +BaseDir=$(dirname $(readlink -f "$0")) +SERVICE_NAME=sysom-cmg + +clear_app() { + rm -rf /etc/supervisord.d/${SERVICE_NAME}.ini + ###use supervisorctl update to stop and clear services### + supervisorctl update +} + +# Stop first +bash -x $BaseDir/stop.sh + +clear_app diff --git a/script/server/sysom_cmg/db_migrate.sh b/script/server/sysom_cmg/db_migrate.sh new file mode 100644 index 0000000000000000000000000000000000000000..f530e1f82f0e3becd05b158b68263bc37fbd0447 --- /dev/null +++ b/script/server/sysom_cmg/db_migrate.sh @@ -0,0 +1,18 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +VIRTUALENV_HOME=$GLOBAL_VIRTUALENV_HOME + +source_virtualenv() { + echo "INFO: activate virtualenv..." + source ${VIRTUALENV_HOME}/bin/activate || exit 1 +} + +db_migrate() { + pushd ${SERVICE_HOME} + alembic upgrade head + popd +} + +source_virtualenv +db_migrate \ No newline at end of file diff --git a/script/server/sysom_cmg/init.sh b/script/server/sysom_cmg/init.sh new file mode 100644 index 0000000000000000000000000000000000000000..1298b134f2327f2ed62631d84c0e7fc77be13861 --- /dev/null +++ b/script/server/sysom_cmg/init.sh @@ -0,0 +1,25 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +BaseDir=$(dirname $(readlink -f "$0")) +SERVICE_NAME=sysom-cmg + +init_conf() { + cp ${SERVICE_NAME}.ini /etc/supervisord.d/ + ###change the install dir base on param $1### + sed -i "s;/usr/local/sysom;${APP_HOME};g" /etc/supervisord.d/${SERVICE_NAME}.ini + cpu_num=$(cat /proc/cpuinfo | grep processor | wc -l) + sed -i "s/threads = 3/threads = $cpu_num/g" ${SERVICE_HOME}/conf/gunicorn.py +} + +init_app() { + init_conf + bash -x $BaseDir/db_migrate.sh + ###if supervisor service started, we need use "supervisorctl update" to start new conf#### + supervisorctl update +} + +init_app + +# Start +bash -x $BaseDir/start.sh diff --git a/script/server/sysom_cmg/install.sh b/script/server/sysom_cmg/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..b7715e4bbebf89e2ca3689c5f2a176753b55a646 --- /dev/null +++ b/script/server/sysom_cmg/install.sh @@ -0,0 +1,29 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +SERVICE_SCRIPT_HOME=${MICROSERVICE_SCRIPT_HOME}/${SERVICE_SCRIPT_DIR} +VIRTUALENV_HOME=$GLOBAL_VIRTUALENV_HOME +SERVICE_NAME=sysom-cmg + +if [ "$UID" -ne 0 ]; then + echo "Please run as root" + exit 1 +fi + +install_requirement() { + pushd ${SERVICE_SCRIPT_HOME} + pip install -r requirements.txt + popd +} + +source_virtualenv() { + echo "INFO: activate virtualenv..." + source ${VIRTUALENV_HOME}/bin/activate || exit 1 +} + +install_app() { + source_virtualenv + install_requirement +} + +install_app diff --git a/script/server/sysom_cmg/requirements.txt b/script/server/sysom_cmg/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c2a17dfe74f495037656202e288b21bf01cc996 --- /dev/null +++ b/script/server/sysom_cmg/requirements.txt @@ -0,0 +1,19 @@ +clogger==0.0.1 +channel_job>=0.0.1 +cec_base>=0.0.1 +cec_redis>=0.0.1 +sysom_utils>=0.0.1 +alembic==1.7.7 +anyio==3.6.2 +asyncer==0.0.2 +asyncssh==2.12.0 +fastapi==0.83.0 +PyMySQL==1.0.2 +pyyaml==6.0 +pyyaml-include==1.3 +uvicorn==0.16.0 +gunicorn==20.1.0 +python-multipart==0.0.5 +###################################################################### +# Add your custom python requirements here +###################################################################### \ No newline at end of file diff --git a/script/server/sysom_cmg/start.sh b/script/server/sysom_cmg/start.sh new file mode 100644 index 0000000000000000000000000000000000000000..95b9a42eacae941985dd63349419b873f89d8eb3 --- /dev/null +++ b/script/server/sysom_cmg/start.sh @@ -0,0 +1,29 @@ +#!/bin/bash +SERVICE_NAME=sysom-cmg + +is_start() { + status=`supervisorctl status ${SERVICE_NAME} | awk '{print $2}'` + result=`echo "RUNNING STARTING" | grep $status` + if [[ "$result" != "" ]] + then + return 1 + else + return 0 + fi +} + +start_app() { + is_start + if [[ $? == 0 ]]; then + supervisorctl start $SERVICE_NAME + is_start + if [[ $? == 0 ]]; then + echo "${SERVICE_NAME} service start fail, please check log" + exit 1 + else + echo "supervisorctl start ${SERVICE_NAME} success..." + fi + fi +} + +start_app diff --git a/script/server/sysom_cmg/stop.sh b/script/server/sysom_cmg/stop.sh new file mode 100644 index 0000000000000000000000000000000000000000..7c747e127e7438b2138ef8158a01028e5001bdbc --- /dev/null +++ b/script/server/sysom_cmg/stop.sh @@ -0,0 +1,8 @@ +#!/bin/bash +SERVICE_NAME=sysom-cmg + +stop_app() { + supervisorctl stop $SERVICE_NAME +} + +stop_app diff --git a/script/server/sysom_cmg/sysom-cmg.ini b/script/server/sysom_cmg/sysom-cmg.ini new file mode 100644 index 0000000000000000000000000000000000000000..f4f381b0e6f361687b724a56158195d58e42ad8b --- /dev/null +++ b/script/server/sysom_cmg/sysom-cmg.ini @@ -0,0 +1,9 @@ +[program:sysom-cmg] +directory=/usr/local/sysom/server/sysom_cmg +command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn.py main:app +startsecs=3 +autostart=true +autorestart=true +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s +stderr_logfile=/var/log/sysom/sysom-cmg-error.log +stdout_logfile=/var/log/sysom/sysom-cmg.log diff --git a/script/server/sysom_cmg/uninstall.sh b/script/server/sysom_cmg/uninstall.sh new file mode 100644 index 0000000000000000000000000000000000000000..0aac04c09286c7f24e98d8a20ce131cdbec36375 --- /dev/null +++ b/script/server/sysom_cmg/uninstall.sh @@ -0,0 +1,9 @@ +#!/bin/bash +BaseDir=$(dirname $(readlink -f "$0")) + +uninstall_app() { + # do nothing + echo "" +} + +uninstall_app \ No newline at end of file diff --git a/script/server/sysom_colocation/clear.sh b/script/server/sysom_colocation/clear.sh new file mode 100644 index 0000000000000000000000000000000000000000..b83765d642bf69a0b4b992eb24304f4c928832ac --- /dev/null +++ b/script/server/sysom_colocation/clear.sh @@ -0,0 +1,14 @@ +#!/bin/bash +BaseDir=$(dirname $(readlink -f "$0")) +SERVICE_NAME=sysom-colocation + +clear_app() { + rm -rf /etc/supervisord.d/${SERVICE_NAME}.ini + ###use supervisorctl update to stop and clear services### + supervisorctl update +} + +# Stop first +bash -x $BaseDir/stop.sh + +clear_app diff --git a/script/server/sysom_colocation/db_migrate.sh b/script/server/sysom_colocation/db_migrate.sh new file mode 100644 index 0000000000000000000000000000000000000000..f530e1f82f0e3becd05b158b68263bc37fbd0447 --- /dev/null +++ b/script/server/sysom_colocation/db_migrate.sh @@ -0,0 +1,18 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +VIRTUALENV_HOME=$GLOBAL_VIRTUALENV_HOME + +source_virtualenv() { + echo "INFO: activate virtualenv..." + source ${VIRTUALENV_HOME}/bin/activate || exit 1 +} + +db_migrate() { + pushd ${SERVICE_HOME} + alembic upgrade head + popd +} + +source_virtualenv +db_migrate \ No newline at end of file diff --git a/script/server/sysom_colocation/init.sh b/script/server/sysom_colocation/init.sh new file mode 100644 index 0000000000000000000000000000000000000000..c1cb2a6390efc2ff1c9431b8661212034fe1b093 --- /dev/null +++ b/script/server/sysom_colocation/init.sh @@ -0,0 +1,25 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +BaseDir=$(dirname $(readlink -f "$0")) +SERVICE_NAME=sysom-colocation + +init_conf() { + cp ${SERVICE_NAME}.ini /etc/supervisord.d/ + ###change the install dir base on param $1### + sed -i "s;/usr/local/sysom;${APP_HOME};g" /etc/supervisord.d/${SERVICE_NAME}.ini + cpu_num=$(cat /proc/cpuinfo | grep processor | wc -l) + sed -i "s/threads = 3/threads = $cpu_num/g" ${SERVICE_HOME}/conf/gunicorn.py +} + +init_app() { + init_conf + bash -x $BaseDir/db_migrate.sh + ###if supervisor service started, we need use "supervisorctl update" to start new conf#### + supervisorctl update +} + +init_app + +# Start +bash -x $BaseDir/start.sh diff --git a/script/server/sysom_colocation/install.sh b/script/server/sysom_colocation/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..ca78ffaa837fbdc1d7c39080f148c3f65a45a606 --- /dev/null +++ b/script/server/sysom_colocation/install.sh @@ -0,0 +1,29 @@ +#!/bin/bash +SERVICE_SCRIPT_DIR=$(basename $(dirname $0)) +SERVICE_HOME=${MICROSERVICE_HOME}/${SERVICE_SCRIPT_DIR} +SERVICE_SCRIPT_HOME=${MICROSERVICE_SCRIPT_HOME}/${SERVICE_SCRIPT_DIR} +VIRTUALENV_HOME=$GLOBAL_VIRTUALENV_HOME +SERVICE_NAME=sysom-colocation + +if [ "$UID" -ne 0 ]; then + echo "Please run as root" + exit 1 +fi + +install_requirement() { + pushd ${SERVICE_SCRIPT_HOME} + pip install -r requirements.txt + popd +} + +source_virtualenv() { + echo "INFO: activate virtualenv..." + source ${VIRTUALENV_HOME}/bin/activate || exit 1 +} + +install_app() { + source_virtualenv + install_requirement +} + +install_app diff --git a/script/server/sysom_colocation/requirements.txt b/script/server/sysom_colocation/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a33cd8a9e7b384055d4c7d2ca8f5458c76a5f37 --- /dev/null +++ b/script/server/sysom_colocation/requirements.txt @@ -0,0 +1,26 @@ +clogger==0.0.1 +channel_job>=0.0.1 +cec_base>=0.0.1 +cec_redis>=0.0.1 +sysom_utils>=0.0.1 +alembic==1.7.7 +anyio==3.6.2 +asyncer==0.0.2 +asyncssh==2.12.0 +fastapi==0.83.0 +PyMySQL==1.0.2 +pyyaml==6.0 +pyyaml-include==1.3 +uvicorn==0.16.0 +gunicorn==20.1.0 +python-multipart==0.0.5 +###################################################################### +# Add your custom python requirements here +###################################################################### +numpy==1.19.5 +pandas==1.1.5 +pytz==2023.3 +prometheus_client==0.16.0 +schedule==1.1.0 +requests==2.27.1 +scipy==1.5.4 \ No newline at end of file diff --git a/script/server/sysom_colocation/start.sh b/script/server/sysom_colocation/start.sh new file mode 100644 index 0000000000000000000000000000000000000000..566ff632eb678d8a073d994077c46b316a4aba85 --- /dev/null +++ b/script/server/sysom_colocation/start.sh @@ -0,0 +1,29 @@ +#!/bin/bash +SERVICE_NAME=sysom-colocation + +is_start() { + status=`supervisorctl status ${SERVICE_NAME} | awk '{print $2}'` + result=`echo "RUNNING STARTING" | grep $status` + if [[ "$result" != "" ]] + then + return 1 + else + return 0 + fi +} + +start_app() { + is_start + if [[ $? == 0 ]]; then + supervisorctl start $SERVICE_NAME + is_start + if [[ $? == 0 ]]; then + echo "${SERVICE_NAME} service start fail, please check log" + exit 1 + else + echo "supervisorctl start ${SERVICE_NAME} success..." + fi + fi +} + +start_app diff --git a/script/server/sysom_colocation/stop.sh b/script/server/sysom_colocation/stop.sh new file mode 100644 index 0000000000000000000000000000000000000000..a7f59c94689fad2a15909f867000b0c656ab561a --- /dev/null +++ b/script/server/sysom_colocation/stop.sh @@ -0,0 +1,8 @@ +#!/bin/bash +SERVICE_NAME=sysom-colocation + +stop_app() { + supervisorctl stop $SERVICE_NAME +} + +stop_app diff --git a/script/server/sysom_colocation/sysom-colocation.ini b/script/server/sysom_colocation/sysom-colocation.ini new file mode 100644 index 0000000000000000000000000000000000000000..caab2b8471f82c1286e5335053fb6f07fcf56f77 --- /dev/null +++ b/script/server/sysom_colocation/sysom-colocation.ini @@ -0,0 +1,9 @@ +[program:sysom-colocation] +directory=/usr/local/sysom/server/sysom_colocation +command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn.py main:app +startsecs=3 +autostart=true +autorestart=true +environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +stderr_logfile=/var/log/sysom/sysom-colocation-error.log +stdout_logfile=/var/log/sysom/sysom-colocation.log diff --git a/script/server/sysom_colocation/uninstall.sh b/script/server/sysom_colocation/uninstall.sh new file mode 100644 index 0000000000000000000000000000000000000000..0aac04c09286c7f24e98d8a20ce131cdbec36375 --- /dev/null +++ b/script/server/sysom_colocation/uninstall.sh @@ -0,0 +1,9 @@ +#!/bin/bash +BaseDir=$(dirname $(readlink -f "$0")) + +uninstall_app() { + # do nothing + echo "" +} + +uninstall_app \ No newline at end of file diff --git a/script/server/sysom_diagnosis/install.sh b/script/server/sysom_diagnosis/install.sh index bc48985e4f787d7c71f421aef21b4e3305f727e2..9e855a827700d31704e5c8cc3dfcae4f218305f3 100644 --- a/script/server/sysom_diagnosis/install.sh +++ b/script/server/sysom_diagnosis/install.sh @@ -24,6 +24,13 @@ source_virtualenv() { source ${VIRTUALENV_HOME}/bin/activate || exit 1 } +prepare_node() { + source_virtualenv + pushd ${SERVICE_SCRIPT_HOME} + python prepare_node.py /etc/sysom/config.yml ${SERVICE_HOME}/config.yml x86_64 || exit 1 + popd +} + install_required_packages() { # required jdk # Check whether java command exists, not exists then install @@ -32,6 +39,8 @@ install_required_packages() { rpm -q --quiet java-1.8.0-openjdk || yum install -y java-1.8.0-openjdk fi + prepare_node + pushd ${SERVICE_HOME}/service_scripts mkdir -p jfrFlold if [ ! -f jfrFlold/${STANDALONE_PACKAGE} ]; then diff --git a/script/server/sysom_diagnosis/prepare_node.py b/script/server/sysom_diagnosis/prepare_node.py new file mode 100644 index 0000000000000000000000000000000000000000..ae9eecb2236964807df7bf85400bf3303013ac54 --- /dev/null +++ b/script/server/sysom_diagnosis/prepare_node.py @@ -0,0 +1,12 @@ +import sys +from sysom_utils import NodeManager, ConfigParser + +if __name__ == "__main__": + if len(sys.argv) < 4: + print("Usage: python node_file_prepare.py ") + sys.exit(1) + global_config_path = sys.argv[1] + service_config_path = sys.argv[2] + arch = sys.argv[3] + config = ConfigParser(global_config_path, service_config_path) + NodeManager(config, None).prepare_files(arch) \ No newline at end of file diff --git a/script/server/sysom_hotfix/requirements.txt b/script/server/sysom_hotfix/requirements.txt index 903fdd70581bde4431dce43d3dced9708be4e0c5..433c06cee07b45b05a69d7e40b72ddb618e4c524 100644 --- a/script/server/sysom_hotfix/requirements.txt +++ b/script/server/sysom_hotfix/requirements.txt @@ -19,4 +19,5 @@ requests==2.27.1 gunicorn==20.1.0 xlwt==1.3.0 xlrd==2.0.1 -beautifulsoup4==4.12.2 \ No newline at end of file +beautifulsoup4==4.12.2 +openpyxl==3.1.2 \ No newline at end of file diff --git a/script/server/sysom_hotfix/sysom-hotfix.ini b/script/server/sysom_hotfix/sysom-hotfix.ini index e3f1b324b3aef81e07cee8a2b1be391f4ba68696..a3207eae76758df47c84b354cd87283afd8f02cc 100644 --- a/script/server/sysom_hotfix/sysom-hotfix.ini +++ b/script/server/sysom_hotfix/sysom-hotfix.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-hotfix-error.log stdout_logfile=/var/log/sysom/sysom-hotfix.log diff --git a/script/server/sysom_knowledge/sysom-knowledge.ini b/script/server/sysom_knowledge/sysom-knowledge.ini index 8a44014ac3d2069d62acabf7d5485c30d13dafac..5912a9aebfc4c986bd9c02c57f5b06a20eb08374 100644 --- a/script/server/sysom_knowledge/sysom-knowledge.ini +++ b/script/server/sysom_knowledge/sysom-knowledge.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-knowledge-error.log stdout_logfile=/var/log/sysom/sysom-knowledge.log diff --git a/script/server/sysom_log/sysom-log.ini b/script/server/sysom_log/sysom-log.ini index 6e25bf8e6602a141c4da5f4c7412111b215982fb..af9f04962aee50e138b216b7cd0fa21167489a7c 100644 --- a/script/server/sysom_log/sysom-log.ini +++ b/script/server/sysom_log/sysom-log.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-log-error.log stdout_logfile=/var/log/sysom/sysom-log.log diff --git a/script/server/sysom_metric_anomaly_detection/sysom-metric_anomaly_detection.ini b/script/server/sysom_metric_anomaly_detection/sysom-metric_anomaly_detection.ini index bf4a0c1d7ca03ac3989337051a31a200e3f2143a..874bc36647e56819c9151d90c4bb500e868df23e 100644 --- a/script/server/sysom_metric_anomaly_detection/sysom-metric_anomaly_detection.ini +++ b/script/server/sysom_metric_anomaly_detection/sysom-metric_anomaly_detection.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-metric_anomaly_detection-error.log stdout_logfile=/var/log/sysom/sysom-metric_anomaly_detection.log diff --git a/script/server/sysom_migration/sysom-migration.ini b/script/server/sysom_migration/sysom-migration.ini index feb252b892439f644547a7d2085d620059490aa7..0a7fa185ab06131018e43851c110ae5556b27963 100644 --- a/script/server/sysom_migration/sysom-migration.ini +++ b/script/server/sysom_migration/sysom-migration.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-migration-error.log stdout_logfile=/var/log/sysom/sysom-migration.log diff --git a/script/server/sysom_monitor_server/sysom-monitor-server.ini b/script/server/sysom_monitor_server/sysom-monitor-server.ini index cdb4aa60339d0138350b04f68d99995e099a4024..4f34e85c29b27f6655d48e5da7fb85a05f8de744 100644 --- a/script/server/sysom_monitor_server/sysom-monitor-server.ini +++ b/script/server/sysom_monitor_server/sysom-monitor-server.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-monitor-server-error.log stdout_logfile=/var/log/sysom/sysom-monitor-server.log diff --git a/script/server/sysom_rca/sysom-rca.ini b/script/server/sysom_rca/sysom-rca.ini index f3338d93c49293efd6ebdfaa98b7a694a6e360fa..177f5a6a50aa488e55130f19900f78f31d4c28cf 100644 --- a/script/server/sysom_rca/sysom-rca.ini +++ b/script/server/sysom_rca/sysom-rca.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-rca-error.log stdout_logfile=/var/log/sysom/sysom-rca.log diff --git a/script/server/sysom_vmcore/sysom-vmcore.ini b/script/server/sysom_vmcore/sysom-vmcore.ini index fcdb24ccfea1af643ab4b662624b8e1aa5a2405d..3cfa77a379bf28f9f36dd44934fa6e6ad76548fe 100644 --- a/script/server/sysom_vmcore/sysom-vmcore.ini +++ b/script/server/sysom_vmcore/sysom-vmcore.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-vmcore-error.log stdout_logfile=/var/log/sysom/sysom-vmcore.log diff --git a/script/server/sysom_vul/sysom-vul.ini b/script/server/sysom_vul/sysom-vul.ini index 18ef6a2cb246dfa61471f6c05de31f7bb41dddaa..e37dbd51997dadb4fc81d8e08effb18a3eab0adf 100644 --- a/script/server/sysom_vul/sysom-vul.ini +++ b/script/server/sysom_vul/sysom-vul.ini @@ -4,6 +4,6 @@ command=/usr/local/sysom/environment/virtualenv/bin/gunicorn -c ./conf/gunicorn. startsecs=3 autostart=true autorestart=true -environment=PATH="/usr/local/sysom/environment/virtualenv/bin/" +environment=PATH=/usr/local/sysom/virtualenv/bin:%(ENV_PATH)s stderr_logfile=/var/log/sysom/sysom-vul-error.log stdout_logfile=/var/log/sysom/sysom-vul.log diff --git a/script/sysak_build.sh b/script/sysak_build.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f3bb0f0f2f9ad7d2d3c0dda5fa84f7a3b8c613f --- /dev/null +++ b/script/sysak_build.sh @@ -0,0 +1,11 @@ +#!/bin/sh +yum install -y git docker + +sudo docker pull registry.cn-hangzhou.aliyuncs.com/sysom/sysom:v7.0 +mkdir -p /tmp/sysak +sudo docker run -d -v /tmp/sysak:/home/sysak -it --name "sysak_rpm_build" registry.cn-hangzhou.aliyuncs.com/sysom/sysom:v7.0 /bin/bash +#sudo docker exec -it sysak_rpm_build sh -c "cd /home/;git clone -b v2.1.0-rc1 https://gitee.com/anolis/sysak.git" +sudo docker exec -it sysak_rpm_build sh -c "cd /home/;git clone -b v$1 https://gitee.com/anolis/sysak.git" +sudo docker exec -it sysak_rpm_build sh -c "cd /home/sysak/rpm/;./sysak-build-nodep.sh $1 1" +sudo docker exec -it sysak_rpm_build sh -c "cp /home/sysak/rpm/BUILDROOT/sysak-$1-1.x86_64 /home/sysak/" +echo "sysak-$1-1.x86_64.rpm in /tmp/sysak/" diff --git a/script/sysom.sh b/script/sysom.sh index d94544bcbf7d886c7d64c6bc1d464f2536db4ddd..0167465f3bbc9e5c78cd584e5d9a52034a8c44c5 100755 --- a/script/sysom.sh +++ b/script/sysom.sh @@ -1,6 +1,12 @@ #!/bin/bash -x BaseDir=$(dirname $(readlink -f "$0")) LocalAppHome=$(dirname $BaseDir) + +# Load env from /proc/1/environ +cat /proc/1/environ | tr '\0' '\n' | awk -F= '{gsub(/\./, "___", $1); print "export \"" $1 "\"=\"" $2 "\""}' > /tmp/proc_1_environ +source /tmp/proc_1_environ +rm -f /tmp/proc_1_environ + #################################################################################################################### # Initialize environment variables #################################################################################################################### @@ -30,7 +36,8 @@ export MICROSERVICE_SCRIPT_HOME=${SCRIPT_HOME}/server export GLOBAL_VIRTUALENV_HOME=${ENVIRONMENT_HOME}/virtualenv if [ "$SERVER_LOCAL_IP" == "" ]; then - local_ip=$(ip -4 route | grep "link src" | awk -F"link src " '{print $2}' | awk '{print $1}' | head -n 1) + # local_ip=$(ip -4 route | grep "link src" | grep -v "docker" | grep -v "podman" | awk -F"link src " '{print $2}' | awk '{print $1}' | head -n 1) + local_ip=$(hostname -i) export SERVER_LOCAL_IP=$local_ip fi diff --git a/sysom_server/sysom_ad_proxy/app/schemas.py b/sysom_server/sysom_ad_proxy/app/schemas.py index 39dd54ed6502575c194a70189e8ca6716c1f1b90..49e9b84a9bc5de74ad07d64a734c8d95b1be9915 100644 --- a/sysom_server/sysom_ad_proxy/app/schemas.py +++ b/sysom_server/sysom_ad_proxy/app/schemas.py @@ -15,6 +15,10 @@ from enum import Enum from typing import List +class AlertLevel(str, Enum): + WARNING = "WARNING" + ERROR = "ERROR" + CRITICAL = "CRITICAL" class AlertType(str, Enum): MONITOR = "MONITOR" # 监控告警 @@ -50,6 +54,7 @@ class AlertData(BaseModel): alert_item: str alert_category: AlertType alert_source_type: str + alert_level: AlertLevel = AlertLevel.WARNING alert_time: int status: AlertStatus = AlertStatus.FIRING labels: dict = {} diff --git a/sysom_server/sysom_ad_proxy/lib/source_convert/grafana.py b/sysom_server/sysom_ad_proxy/lib/source_convert/grafana.py index 130e2cfba9190b867d3aafef40b3f06419972739..3190b9e9b7ea94b8db424a55901643ff2165c2a2 100644 --- a/sysom_server/sysom_ad_proxy/lib/source_convert/grafana.py +++ b/sysom_server/sysom_ad_proxy/lib/source_convert/grafana.py @@ -75,7 +75,7 @@ class SourceConverter(SourceConverterBase): alert_time=SourceConverterBase.iso_to_timestamp( alert.get("startsAt", "") ), - status=alert.get("status", AlertStatus.FIRING), + status=alert.get("status", AlertStatus.FIRING).upper(), labels=alert.get("labels", {}), annotations=alert.get("annotations", {}), origin_alert_data=alert, diff --git a/sysom_server/sysom_alarm/app/query.py b/sysom_server/sysom_alarm/app/query.py index a36bd152b69b93ebdc92a92902bcd6051fbfa8a2..6d3347c51cd11dbff6212fc9e5f6983897ba953d 100644 --- a/sysom_server/sysom_alarm/app/query.py +++ b/sysom_server/sysom_alarm/app/query.py @@ -6,7 +6,7 @@ Email mfeng@linux.alibaba.com File schemas.py Description: """ -from typing import Optional +from typing import Optional, List from app import models from sysom_utils import BaseQueryParams @@ -24,5 +24,11 @@ class AlertDataQueryParams(BaseQueryParams): alert_item: Optional[str] = None alert_category: Optional[str] = None alert_source_type: Optional[str] = None + alert_level: Optional[str] = None + labels: Optional[str] = None status: Optional[str] = None deal_status: Optional[str] = None + + # 4. 指定支持模糊查询的字段 + like_fields: List[str] = ["alert_item"] + json_fields: List[str] = ["labels", "annotations"] diff --git a/sysom_server/sysom_alert_pusher/alembic.ini b/sysom_server/sysom_alert_pusher/alembic.ini new file mode 100644 index 0000000000000000000000000000000000000000..f6ab9febcd93add9d0ea8857f857d7f30f1fe48f --- /dev/null +++ b/sysom_server/sysom_alert_pusher/alembic.ini @@ -0,0 +1,102 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = alembic + +# template used to generate migration files +# file_template = %%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python-dateutil library that can be +# installed by adding `alembic[tz]` to the pip requirements +# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = "" + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/sysom_server/sysom_alert_pusher/alembic/README b/sysom_server/sysom_alert_pusher/alembic/README new file mode 100644 index 0000000000000000000000000000000000000000..98e4f9c44effe479ed38c66ba922e7bcc672916f --- /dev/null +++ b/sysom_server/sysom_alert_pusher/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/alembic/env.py b/sysom_server/sysom_alert_pusher/alembic/env.py new file mode 100644 index 0000000000000000000000000000000000000000..9c45eb47846ad0b2bb0b5392df8fc8eaa9f7d72f --- /dev/null +++ b/sysom_server/sysom_alert_pusher/alembic/env.py @@ -0,0 +1,115 @@ +import inspect +import app.models as models +from logging.config import fileConfig +from sqlalchemy import engine_from_config, Table +from sqlalchemy import pool +from app.models import Base +from alembic import context +from conf.settings import YAML_CONFIG, SQLALCHEMY_DATABASE_URL + +################################################################## +# Load yaml config first +################################################################## +mysql_config = YAML_CONFIG.get_server_config().db.mysql + +################################################################## +# Scan models +################################################################## +service_tables = [] +for name, data in inspect.getmembers(models): + if inspect.isclass(data): + if data.__module__ != "app.models": + continue + if "__tablename__" in data.__dict__: + service_tables.append(data.__dict__["__tablename__"]) + elif "__table__" in data.__dict__: + service_tables.append(data.__dict__["__table__"]) + elif isinstance(data, Table): + service_tables.append(name) + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# Update mysql config according config.yml +config.set_main_option( + "sqlalchemy.url", + SQLALCHEMY_DATABASE_URL +) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + +def include_object(object, name, type_, reflected, compare_to): + if type_ == "table" and name not in service_tables: + return False + return True + + +def run_migrations_offline(): + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + include_object=include_object, + version_table="alert_pusher_version", + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online(): + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata, + include_object=include_object, + version_table="alert_pusher_version" + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/sysom_server/sysom_alert_pusher/alembic/script.py.mako b/sysom_server/sysom_alert_pusher/alembic/script.py.mako new file mode 100644 index 0000000000000000000000000000000000000000..2c0156303a8df3ffdc9de87765bf801bf6bea4a5 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/alembic/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade(): + ${upgrades if upgrades else "pass"} + + +def downgrade(): + ${downgrades if downgrades else "pass"} diff --git a/sysom_server/sysom_alert_pusher/alembic/versions/.gitkeep b/sysom_server/sysom_alert_pusher/alembic/versions/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_server/sysom_alert_pusher/app/__init__.py b/sysom_server/sysom_alert_pusher/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..efb7beefd7871c7c14bd4f3848835bd881c67f1c --- /dev/null +++ b/sysom_server/sysom_alert_pusher/app/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File __init__.py +Description: +""" \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/app/crud.py b/sysom_server/sysom_alert_pusher/app/crud.py new file mode 100644 index 0000000000000000000000000000000000000000..ec0c0b769db7398aaaaa1613dba00f890a267682 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/app/crud.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File crud.py +Description: +""" +from typing import Optional, List +from sqlalchemy.orm import Session +from app import models, schemas, query + +################################################################################################ +# Define database crud here +################################################################################################ + +# def get_person_by_name(db: Session, name: str) -> Optional[models.Person]: +# return db.query(models.Person).filter(models.Person.name == name).first() + +# def create_person(db: Session, person: schemas.Person) -> models.Person: +# person = models.Person(**person.dict()) +# db.add(person) +# db.commit() +# db.refresh(person) +# return person + +# def del_person_by_id(db: Session, person_id: int): +# person = db.get(models.Person, person_id) +# db.delete(person) +# db.commit() + +# def get_person_list(db: Session, query_params: query.PersonQueryParams) -> List[models.Person]: +# return ( +# query_params.get_query_exp(db) +# .all() +# ) diff --git a/sysom_server/sysom_alert_pusher/app/database.py b/sysom_server/sysom_alert_pusher/app/database.py new file mode 100644 index 0000000000000000000000000000000000000000..39d6880c08aa365fdbd1c1e1c669a75314d0e652 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/app/database.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File database.py +Description: +""" +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from conf.settings import SQLALCHEMY_DATABASE_URL +from sysom_utils import FastApiResponseHelper + +engine = create_engine( + SQLALCHEMY_DATABASE_URL, connect_args={} +) + +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() + +Base = declarative_base() + +FastApiResponseHelper.bind_base_class(Base) \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/app/executor.py b/sysom_server/sysom_alert_pusher/app/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..9dbdd5a77880cddb92d737231dcb962bb8f2bc68 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/app/executor.py @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- # +""" +Time 2022/10/11 16:13 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File executor.py +Description: +""" +import time +import asyncio +from queue import Queue +from typing import Callable, Optional +from cec_base.event import Event +from cec_base.consumer import Consumer +from cec_base.cec_client import MultiConsumer, CecAsyncConsumeTask, StoppableThread +from clogger import logger +from importlib import import_module +from conf.settings import * +from sysom_utils import CecTarget + + +class AsyncMultiConsumer(MultiConsumer): + def __init__( + self, + url: str, + sync_mode: bool = False, + custom_callback: Callable[[Event, CecAsyncConsumeTask], None] = None, + **kwargs, + ): + super().__init__(url, sync_mode, custom_callback, **kwargs) + + # 执行任务的线程池数量 + self._task_process_thread: Optional[StoppableThread] = None + self._task_queue: Queue = Queue(maxsize=1000) + + def add_async_task(self, task: asyncio.Task): + self._task_queue.put(task) + + def _process_task(self): + def _get_task_from_queue(): + _tasks = [] + while not self._task_queue.empty(): + _task = self._task_queue.get_nowait() + if _task: + _tasks.append(_task) + else: + break + return _tasks + + tasks = _get_task_from_queue() + loop = asyncio.new_event_loop() + assert self._task_process_thread is not None + while not self._task_process_thread.stopped(): + if len(tasks) == 0: + time.sleep(0.1) + tasks = _get_task_from_queue() + continue + finished, unfinished = loop.run_until_complete( + asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED, timeout=0.5) + ) + for task in finished: + if task.exception() is not None: + logger.error(str(task.exception())) + else: + pass + tasks = _get_task_from_queue() + if unfinished is not None: + tasks += list(unfinished) + + def start(self): + super().start() + if ( + self._task_process_thread is not None + and not self._task_process_thread.stopped() + and self._task_process_thread.is_alive() + ): + return + self._task_process_thread = StoppableThread(target=self._process_task) + self._task_process_thread.setDaemon(True) + self._task_process_thread.start() + + +class AlertListener(AsyncMultiConsumer): + """A cec-based channel listener + + A cec-based channel lilster, ssed to listen to requests for channels from + other modules and output the results to cec after performing the corresponding + operation on the target node + + Args: + task_process_thread_num(str): The number of threads contained in the thread + pool used to execute the task + + """ + + def __init__(self) -> None: + super().__init__( + YAML_CONFIG.get_cec_url(CecTarget.PRODUCER), + custom_callback=self.on_receive_event, + ) + self.append_group_consume_task( + CEC_TOPIC_SYSOM_SAD_ALERT, + "sysom_alert_pusher", + Consumer.generate_consumer_id(), + ensure_topic_exist=True, + ) + + self.rules = {} + self.targets = {} + + # Initial all targets + for target_type, targets in service_config.get("push_targets", {}).items(): + target_class = self._get_push_target_class(target_type) + for target_name, target_config in targets.items(): + self.targets[f"{target_type}.{target_name}"] = target_class( + target_config + ) + + # Initial all rules + for rule_type, rules in service_config.get("push_rules", {}).items(): + rule_class = self._get_push_rule_class(rule_type) + for rule_name, rule_config in rules.items(): + self.rules[f"{rule_type}.{rule_name}"] = rule_class(rule_config) + + # 执行任务的线程池数量 + self._task_process_thread: Optional[StoppableThread] = None + self._task_queue: Queue = Queue(maxsize=1000) + + def _get_push_target_class(self, target_type): + """ + Get the push target class according to the target type + """ + try: + return getattr( + import_module(f"lib.targets.{target_type}"), + f"PushTarget{target_type.title()}", + ) + except Exception as e: + raise Exception(f"No channels available => {str(e)}") + + def _get_push_rule_class(self, rule_name): + """ + Get the push rule class according to the rule name + """ + try: + return getattr( + import_module(f"lib.rules.{rule_name}"), f"PushRule{rule_name.title()}" + ) + except Exception as e: + raise Exception(f"No rules available => {str(e)}") + + def _deal_recevied_data(self, data: dict): + """ + 处理接收到的数据 + """ + for rule in self.rules.values(): + if rule.is_match(data): + for target_name in rule.get_targets(): + target = self.targets.get(target_name) + if target is None: + logger.warning(f"Target not found, target = {target_name}") + continue + self.add_async_task( + target.push(data) + ) + + def on_receive_event(self, event: Event, task: CecAsyncConsumeTask): + """ + 处理每个单独的任务 + """ + event_value = event.value + try: + assert isinstance(event_value, dict) + if task.topic_name == CEC_TOPIC_SYSOM_SAD_ALERT: + self._deal_recevied_data(event_value) + else: + logger.warning( + f"Received not expect topic data, topic = {task.topic_name}" + ) + except Exception as e: + logger.exception(e) + finally: + # 执行消息确认 + task.ack(event) diff --git a/sysom_server/sysom_alert_pusher/app/models.py b/sysom_server/sysom_alert_pusher/app/models.py new file mode 100644 index 0000000000000000000000000000000000000000..25cbe1c14a62772b981d79b34760fe3d07cea259 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/app/models.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File models.py +Description: +""" +from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy.sql import func +from app.database import Base + + +########################################################################### +# Define databse model here +########################################################################### + +# @reference https://fastapi.tiangolo.com/zh/tutorial/sql-databases/ +# class Person(Base): +# __tablename__ = "sys_person" +# id = Column(Integer, primary_key=True) +# name = Column(String(254), unique=True) +# age = Column(Integer) +# created_at = Column(DateTime(timezone=True), server_default=func.now()) \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/app/query.py b/sysom_server/sysom_alert_pusher/app/query.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbcd0898fc8d3f2be8d64ce694d7aa99ccb2332 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/app/query.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/09/19 15:41 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File schemas.py +Description: +""" +from typing import Optional +from app import models +from sysom_utils import BaseQueryParams + + +# class PersonQueryParams(BaseQueryParams): + +# # 1. 指定要查询的模型 +# __modelclass__ = models.Person + +# # 2. 定义排序字段 +# sort: str = "-created_at" + +# # 3. 定义支持用于过滤的参数 +# name: Optional[str] = None +# age: Optional[str] = None + +# # 4. 指定哪些字段是枚举类型,并且指明对应的枚举类 +# __enum_fields__ = { +# } \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/app/routers/health.py b/sysom_server/sysom_alert_pusher/app/routers/health.py new file mode 100644 index 0000000000000000000000000000000000000000..41ca2ba30c158001ada22f7994b6713592af043e --- /dev/null +++ b/sysom_server/sysom_alert_pusher/app/routers/health.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File health.py +Description: +""" +from clogger import logger +from typing import Dict +from fastapi import APIRouter, Body + + +router = APIRouter() + + +@router.get("/check") +async def helth_check(): + return { + "code": 0, + "err_msg": "", + "data": "" + } + +@router.post("/dummy_echo") +async def dummy_echo(item: Dict = Body(...)): + return { + "code": 0, + "err_msg": "", + "data": item + } diff --git a/sysom_server/sysom_alert_pusher/app/schemas.py b/sysom_server/sysom_alert_pusher/app/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..56dd69e3383760cc88ab6e3a925f245b339416bc --- /dev/null +++ b/sysom_server/sysom_alert_pusher/app/schemas.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File schemas.py +Description: +""" +from pydantic import BaseModel +from enum import Enum + +########################################################################### +# Define schemas here +########################################################################### + +from typing import List + +class AlertLevel(str, Enum): + WARNING = "WARNING" + ERROR = "ERROR" + CRITICAL = "CRITICAL" + +class AlertType(str, Enum): + MONITOR = "MONITOR" # 监控告警 + APPLICATION = "APPLICATION" # 应用告警 + OTHER = "OTHER" # 其它类型告警 + + +class AlertStatus(str, Enum): + NORMAL = "NORMAL" + PENDING = "PENDING" + FIRING = "FIRING" + RESOLVED = "RESOLVED" + + +class AlertData(BaseModel): + """SysOM Alert data format definition + + Attributes: + alert_id(str): 告警ID(使用 uuid v4 生成) + instance(str): 告警实例 + alert_item(str): 告警项,用于唯一标识一类告警,比如每个告警规则可以对应一个告警项 + alert_category(AlertType): 告警类别 + alert_source_type(str): 告警源类型,例如:Grafana、Alert + alert_time(int): 告警发生时间,采用时间戳,单位为 ms + status(AlertStatus): 告警状态 normal -> pending -> firing -> resolved + labels(dict): 告警标签 + annotations(dict): 告警注释 + origin_alert_data: 原始告警数据 + """ + + alert_id: str + instance: str + alert_item: str + alert_category: AlertType + alert_source_type: str + alert_level: AlertLevel = AlertLevel.WARNING + alert_time: int + status: AlertStatus = AlertStatus.FIRING + labels: dict = {} + annotations: dict = {} + origin_alert_data: dict = {} \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/conf/common.py b/sysom_server/sysom_alert_pusher/conf/common.py new file mode 100644 index 0000000000000000000000000000000000000000..654f9c6970bb1692b77a83849d522716e3651de8 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/conf/common.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File common.py +Description: +""" +from pathlib import Path +from sysom_utils import ConfigParser, SysomFramework, CecTarget + +BASE_DIR = Path(__file__).resolve().parent.parent + +################################################################## +# Load yaml config first +################################################################## +YAML_GLOBAL_CONFIG_PATH = f"{BASE_DIR.parent.parent}/conf/config.yml" +YAML_SERVICE_CONFIG_PATH = f"{BASE_DIR}/config.yml" + +YAML_CONFIG = ConfigParser(YAML_GLOBAL_CONFIG_PATH, YAML_SERVICE_CONFIG_PATH) + +mysql_config = YAML_CONFIG.get_server_config().db.mysql +service_config = YAML_CONFIG.get_service_config() + +SysomFramework.init(YAML_CONFIG) + +################################################################## +# fastapi config +################################################################## +SQLALCHEMY_DATABASE_URL = ( + f"{mysql_config.dialect}+{mysql_config.engine}://{mysql_config.user}:{mysql_config.password}@" + f"{mysql_config.host}:{mysql_config.port}/{mysql_config.database}" +) + +################################################################## +# Cec settings +################################################################## +SYSOM_CEC_URL = YAML_CONFIG.get_cec_url(CecTarget.PRODUCER) +# 告警中心接收 SAD 格式告警的主题 +CEC_TOPIC_SYSOM_SAD_ALERT = "SYSOM_SAD_ALERT" diff --git a/sysom_server/sysom_alert_pusher/conf/develop.py b/sysom_server/sysom_alert_pusher/conf/develop.py new file mode 100644 index 0000000000000000000000000000000000000000..f4fc3307afde2cb238017722f3b4866677e9b18f --- /dev/null +++ b/sysom_server/sysom_alert_pusher/conf/develop.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File develoop.py +Description: +""" +from .common import * + +''' +开发环境配置项 +''' + +DEBUG = True \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/conf/gunicorn.py b/sysom_server/sysom_alert_pusher/conf/gunicorn.py new file mode 100644 index 0000000000000000000000000000000000000000..b3e14e8447a85d6777d4a6f5ba12167aac7fe7a3 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/conf/gunicorn.py @@ -0,0 +1,23 @@ +''' +Channel Service Gunicorn Settings +''' +from conf.common import YAML_CONFIG + +bind = YAML_CONFIG.get_service_config().get("bind", "127.0.0.1") +port = YAML_CONFIG.get_service_config().get("port", "80") + +workers = 2 # 指定工作进程数 + +threads = 3 + +bind = f'{bind}:{port}' + +worker_class = 'uvicorn.workers.UvicornWorker' # 工作模式线程, 默认为sync模式 + +max_requests = 2000 # 设置最大并发数量为2000 (每个worker处理请求的工作线程) + +accesslog = '/var/log/sysom/sysom-alert_pusher-access.log' + +loglevel = 'error' + +proc_name = 'sysom_alert_pusher_service' diff --git a/sysom_server/sysom_alert_pusher/conf/product.py b/sysom_server/sysom_alert_pusher/conf/product.py new file mode 100644 index 0000000000000000000000000000000000000000..c64d40084ca3cd0246499cc7ebf16f626f85698d --- /dev/null +++ b/sysom_server/sysom_alert_pusher/conf/product.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File product.py +Description: +""" +from .common import * + +''' +生产环境配置项 +''' + +DEBUG = False diff --git a/sysom_server/sysom_alert_pusher/conf/settings.py b/sysom_server/sysom_alert_pusher/conf/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c0335aa7886699b5908aa9736c22a5f8224f95 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/conf/settings.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File settings.py +Description: +""" +import os + +env = os.environ.get("env", "product") + + +if env == "develop": + from .develop import * +elif env == "testing": + from .testing import * +elif env == "product": + from .product import * \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/conf/testing.py b/sysom_server/sysom_alert_pusher/conf/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..1694474603b5e08cdd80ff8a068ffde1674a5137 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/conf/testing.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File testing.py +Description: +""" +from .common import * + +''' +测试环境配置项 +''' +DEBUG = True diff --git a/sysom_server/sysom_alert_pusher/config.yml b/sysom_server/sysom_alert_pusher/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..88b375f3b6216bc204cab4333fcfdf5c780cbda2 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/config.yml @@ -0,0 +1,58 @@ +vars: + SERVICE_NAME: &SERVICE_NAME sysom_alert_pusher + SERVICE_CONSUMER_GROUP: + !concat &SERVICE_CONSUMER_GROUP [*SERVICE_NAME, "_consumer_group"] + +sysom_server: + cec: + consumer_group: *SERVICE_CONSUMER_GROUP + +sysom_service: + service_name: *SERVICE_NAME + service_dir: *SERVICE_NAME + protocol: http + host: 127.0.0.1 + bind: 127.0.0.1 + port: 7018 + framework: + gcache: + protocol: redis + node_dispatch: + cmg: + tags: + - alert_pusher + - FastApi + # Metadata of service + metadata: + check: + type: http + url: "/api/v1/alert_pusher/health/check" + interval: 10 + timeout: 10 + deregister: 25 + header: + tls_skip_verify: false + push_rules: + regex: + default: + rules: + alert_item: sysload_(.*?) + targets: + - webhook.duty_robot + dintalk: + rules: + alert_item: kmsg + targets: + - webhook.dingtalk + push_targets: + webhook: + duty_robot: + url: http://localhost:7018/api/v1/alert_pusher/health/dummy_echo + method: POST + headers: + Content-Type: application/json + dingtalk: + url: http://localhost:7019/api/v1/dingtalk/webhook_post + method: POST + headers: + Content-Type: application/json diff --git a/sysom_server/sysom_alert_pusher/lib/README.md b/sysom_server/sysom_alert_pusher/lib/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ec74424ba6ecde5d035d0c113bafb0ddc6e4cfa --- /dev/null +++ b/sysom_server/sysom_alert_pusher/lib/README.md @@ -0,0 +1 @@ +The current directory holds the public libraries or utils needed for microservices \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/lib/rule_engine.py b/sysom_server/sysom_alert_pusher/lib/rule_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_server/sysom_alert_pusher/lib/rules/base.py b/sysom_server/sysom_alert_pusher/lib/rules/base.py new file mode 100644 index 0000000000000000000000000000000000000000..bca9f7bd6a336aa9df657f5a216de3338787255a --- /dev/null +++ b/sysom_server/sysom_alert_pusher/lib/rules/base.py @@ -0,0 +1,28 @@ +from abc import ABCMeta, abstractmethod +from app.schemas import AlertData +from typing import List + +class PushRuleBase(metaclass=ABCMeta): + def __init__(self, config: dict) -> None: + self.config = config + self.targets = config.get("targets", []) + + @abstractmethod + def is_match(self, data: dict) -> bool: + """Return whether the rule is matched. + + Args: + data (dict): _description_ + """ + pass + + def is_match_alert_data(self, alert_data: AlertData) -> bool: + """Return whether the rule is matched. + + Args: + alert_data (AlertData): _description_ + """ + return self.is_match(alert_data.dict()) + + def get_targets(self) -> List[str]: + return self.targets \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/lib/rules/regex.py b/sysom_server/sysom_alert_pusher/lib/rules/regex.py new file mode 100644 index 0000000000000000000000000000000000000000..52e942b4e0befc5582432298b49a661dc25d0b61 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/lib/rules/regex.py @@ -0,0 +1,22 @@ +import re +from clogger import logger +from sysom_utils.adddict import Dict +from .base import PushRuleBase + + +class PushRuleRegex(PushRuleBase): + def __init__(self, config: dict) -> None: + super().__init__(config) + self.rules = config.get("rules", {}) + + def is_match(self, data: dict) -> bool: + data = Dict(data) + for field, parttern in self.rules.items(): + value = data.get_multi(field) + if value is None: + logger.warning(f"PushRuleRegex: Field not found, field = {field}") + return False + if re.match(parttern, value) is None: + logger.warning(f"PushRuleRegex: Regex not match, field = {field}, parttern = {parttern}") + return False + return True \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/lib/targets/base.py b/sysom_server/sysom_alert_pusher/lib/targets/base.py new file mode 100644 index 0000000000000000000000000000000000000000..c718e4cb0acadaf3577d96cc57ca4f203f140f56 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/lib/targets/base.py @@ -0,0 +1,16 @@ +from abc import ABCMeta, abstractmethod +from app.schemas import AlertData + +class PushTargetBase(metaclass=ABCMeta): + + def __init__(self, config: dict) -> None: + self.config = config + + async def push_alert_data(self, alert_data: AlertData): + """Push alert data to the target. + """ + await self.push(alert_data.dict()) + + @abstractmethod + async def push(self, data: dict): + pass \ No newline at end of file diff --git a/sysom_server/sysom_alert_pusher/lib/targets/webhook.py b/sysom_server/sysom_alert_pusher/lib/targets/webhook.py new file mode 100644 index 0000000000000000000000000000000000000000..5ddb969ca30bb9f6056f520dadeee0466a04cfe9 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/lib/targets/webhook.py @@ -0,0 +1,19 @@ +import aiohttp +from .base import PushTargetBase + + +class PushTargetWebhook(PushTargetBase): + def __init__(self, config: dict) -> None: + self.url = config.get("url", "") + self.method = config.get("method", "POST") + self.headers = config.get("headers", {}) + self.rules = config.get("rules", []) + + async def push(self, data: dict): + async with aiohttp.ClientSession() as session: + async with session.request( + self.method, self.url, headers=self.headers, json=data + ) as resp: + if resp.status != 200: + raise Exception(f"Webhook send failed, status = {resp.status}") + return await resp.json() diff --git a/sysom_server/sysom_alert_pusher/main.py b/sysom_server/sysom_alert_pusher/main.py new file mode 100644 index 0000000000000000000000000000000000000000..398246dbdb5128d752a7b0b3552ebd0b5cfa1660 --- /dev/null +++ b/sysom_server/sysom_alert_pusher/main.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/23 19:11 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File ssh.py +Description: +""" +from clogger import logger +from fastapi import FastAPI +from app.routers import health +from conf.settings import YAML_CONFIG +from sysom_utils import CmgPlugin, SysomFramework +from app.executor import AlertListener + + +app = FastAPI() + +app.include_router(health.router, prefix="/api/v1/alert_pusher/health") +# app.include_router(health.router, prefix="/api/v1/alert_pusher/person") + + +############################################################################# +# Write your API interface here, or add to app/routes +############################################################################# + + +def init_framwork(): + SysomFramework\ + .init(YAML_CONFIG) \ + .load_plugin_cls(CmgPlugin) \ + .start() + logger.info("SysomFramework init finished!") + + +@app.on_event("startup") +async def on_start(): + init_framwork() + + ############################################################################# + # Perform some microservice initialization operations over here + ############################################################################# + + AlertListener().start() + + +@app.on_event("shutdown") +async def on_shutdown(): + pass \ No newline at end of file diff --git a/sysom_server/sysom_api/apps/accounts/views.py b/sysom_server/sysom_api/apps/accounts/views.py index ceef772bfe53d8411dba9d0a3e7ce850e3b34478..efb169c9b4f82bfa5bff0c86024d959e60c60242 100644 --- a/sysom_server/sysom_api/apps/accounts/views.py +++ b/sysom_server/sysom_api/apps/accounts/views.py @@ -105,10 +105,10 @@ class UserModelViewSet( option = getattr(models.HandlerOptionEnum, request_option.upper()).value if option is not None: kwargs['request_option'] = option - request_ip = params.get('request_ip', None) - request_url = params.get('request_url', None) - request_method: str = params.get('request_method', None) - response_status = params.get('response_status', None) + request_ip = params.get('request_ip', "localhost") + request_url = params.get('request_url', "") + request_method: str = params.get('request_method', "") + response_status = params.get('response_status', "") start_time = params.get('startTime', '2000-01-01 00:00:00') end_time = params.get('endTime', datetime.now().strftime("%Y-%m-%d %H:%M:%S")) diff --git a/sysom_server/sysom_channel/app/executor.py b/sysom_server/sysom_channel/app/executor.py index 8cce15142d269931633526c05e1176b5207f477c..4d2d8ca5210e0cb34360230f6bbca8a3d8f96942 100644 --- a/sysom_server/sysom_channel/app/executor.py +++ b/sysom_server/sysom_channel/app/executor.py @@ -80,36 +80,36 @@ class ChannelListener(MultiConsumer): Use the specified channel to perform operations on the remote node and return the results. """ - async def _try_another_channel(result: ChannelResult): - channels_path = os.path.join(BASE_DIR, 'lib', 'channels') - packages = [dir.replace('.py', '') for dir in os.listdir( - channels_path) if not dir.startswith('__')] - packages.remove('base') - packages.remove(default_channel) - err = None - for _, pkg in enumerate(packages): - try: - result = await opt_func(pkg, task) - err = None - break - except Exception as exc: - logger.error(str(exc)) - err = exc - return result, err + # async def _try_another_channel(result: ChannelResult): + # channels_path = os.path.join(BASE_DIR, 'lib', 'channels') + # packages = [dir.replace('.py', '') for dir in os.listdir( + # channels_path) if not dir.startswith('__')] + # packages.remove('base') + # packages.remove(default_channel) + # err = None + # for _, pkg in enumerate(packages): + # try: + # result = await opt_func(pkg, task) + # err = None + # break + # except Exception as exc: + # logger.error(str(exc)) + # err = exc + # return result, err result, err = ChannelResult(code=1), None try: result = await opt_func(default_channel, task) - if result.code != 0: - result, inner_err = await _try_another_channel(result) - if inner_err is not None: - err = inner_err + # if result.code != 0: + # result, inner_err = await _try_another_channel(result) + # if inner_err is not None: + # err = inner_err except Exception as exc: logger.error(str(exc)) err = exc - result, inner_err = await _try_another_channel(result) - if inner_err is not None: - err = inner_err + # result, inner_err = await _try_another_channel(result) + # if inner_err is not None: + # err = inner_err if err is not None: raise err return result diff --git a/sysom_server/sysom_cluster_health/alembic.ini b/sysom_server/sysom_cluster_health/alembic.ini new file mode 100644 index 0000000000000000000000000000000000000000..f6ab9febcd93add9d0ea8857f857d7f30f1fe48f --- /dev/null +++ b/sysom_server/sysom_cluster_health/alembic.ini @@ -0,0 +1,102 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = alembic + +# template used to generate migration files +# file_template = %%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python-dateutil library that can be +# installed by adding `alembic[tz]` to the pip requirements +# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = "" + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/sysom_server/sysom_cluster_health/alembic/README b/sysom_server/sysom_cluster_health/alembic/README new file mode 100644 index 0000000000000000000000000000000000000000..98e4f9c44effe479ed38c66ba922e7bcc672916f --- /dev/null +++ b/sysom_server/sysom_cluster_health/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/alembic/env.py b/sysom_server/sysom_cluster_health/alembic/env.py new file mode 100644 index 0000000000000000000000000000000000000000..bee7999ad102fd560f883a2b97b45febf4af989c --- /dev/null +++ b/sysom_server/sysom_cluster_health/alembic/env.py @@ -0,0 +1,115 @@ +import inspect +import app.models as models +from logging.config import fileConfig +from sqlalchemy import engine_from_config, Table +from sqlalchemy import pool +from app.models import Base +from alembic import context +from conf.settings import YAML_CONFIG, SQLALCHEMY_DATABASE_URL + +################################################################## +# Load yaml config first +################################################################## +mysql_config = YAML_CONFIG.get_server_config().db.mysql + +################################################################## +# Scan models +################################################################## +service_tables = [] +for name, data in inspect.getmembers(models): + if inspect.isclass(data): + if data.__module__ != "app.models": + continue + if "__tablename__" in data.__dict__: + service_tables.append(data.__dict__["__tablename__"]) + elif "__table__" in data.__dict__: + service_tables.append(data.__dict__["__table__"]) + elif isinstance(data, Table): + service_tables.append(name) + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# Update mysql config according config.yml +config.set_main_option( + "sqlalchemy.url", + SQLALCHEMY_DATABASE_URL +) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + +def include_object(object, name, type_, reflected, compare_to): + if type_ == "table" and name not in service_tables: + return False + return True + + +def run_migrations_offline(): + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + include_object=include_object, + version_table="cluster_health_version", + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online(): + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata, + include_object=include_object, + version_table="cluster_health_version" + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/sysom_server/sysom_cluster_health/alembic/script.py.mako b/sysom_server/sysom_cluster_health/alembic/script.py.mako new file mode 100644 index 0000000000000000000000000000000000000000..2c0156303a8df3ffdc9de87765bf801bf6bea4a5 --- /dev/null +++ b/sysom_server/sysom_cluster_health/alembic/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade(): + ${upgrades if upgrades else "pass"} + + +def downgrade(): + ${downgrades if downgrades else "pass"} diff --git a/sysom_server/sysom_cluster_health/alembic/versions/.gitkeep b/sysom_server/sysom_cluster_health/alembic/versions/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_server/sysom_cluster_health/alembic/versions/f2217aef0227_cluster_health.py b/sysom_server/sysom_cluster_health/alembic/versions/f2217aef0227_cluster_health.py new file mode 100644 index 0000000000000000000000000000000000000000..bf5a0a44f9b5f8ee37ee53a371e1a19e7dfc1309 --- /dev/null +++ b/sysom_server/sysom_cluster_health/alembic/versions/f2217aef0227_cluster_health.py @@ -0,0 +1,66 @@ +"""cluster_health + +Revision ID: f2217aef0227 +Revises: +Create Date: 2024-03-11 14:16:58.653466 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = 'f2217aef0227' +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('sys_abnormal_metrics_cluster', + sa.Column('uuid', sa.String(length=128), nullable=False), + sa.Column('metric_id', sa.String(length=256), nullable=True), + sa.Column('metric_type', sa.String(length=128), nullable=True), + sa.Column('score', sa.Float(), nullable=True), + sa.Column('value', sa.Float(), nullable=True), + sa.Column('timestamp', sa.Float(), nullable=True), + sa.Column('cluster', sa.String(length=256), nullable=True), + sa.PrimaryKeyConstraint('uuid'), + sa.UniqueConstraint('uuid') + ) + op.create_table('sys_abnormal_metrics_node', + sa.Column('uuid', sa.String(length=128), nullable=False), + sa.Column('metric_id', sa.String(length=256), nullable=True), + sa.Column('metric_type', sa.String(length=128), nullable=True), + sa.Column('score', sa.Float(), nullable=True), + sa.Column('value', sa.Float(), nullable=True), + sa.Column('timestamp', sa.Float(), nullable=True), + sa.Column('cluster', sa.String(length=256), nullable=True), + sa.Column('instance', sa.String(length=256), nullable=True), + sa.PrimaryKeyConstraint('uuid'), + sa.UniqueConstraint('uuid') + ) + op.create_table('sys_abnormal_metrics_pod', + sa.Column('uuid', sa.String(length=128), nullable=False), + sa.Column('metric_id', sa.String(length=256), nullable=True), + sa.Column('metric_type', sa.String(length=128), nullable=True), + sa.Column('score', sa.Float(), nullable=True), + sa.Column('value', sa.Float(), nullable=True), + sa.Column('timestamp', sa.Float(), nullable=True), + sa.Column('cluster', sa.String(length=256), nullable=True), + sa.Column('instance', sa.String(length=256), nullable=True), + sa.Column('pod', sa.String(length=256), nullable=True), + sa.Column('namespace', sa.String(length=256), nullable=True), + sa.PrimaryKeyConstraint('uuid'), + sa.UniqueConstraint('uuid') + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('sys_abnormal_metrics_pod') + op.drop_table('sys_abnormal_metrics_node') + op.drop_table('sys_abnormal_metrics_cluster') + # ### end Alembic commands ### diff --git a/sysom_server/sysom_cluster_health/app/__init__.py b/sysom_server/sysom_cluster_health/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fc98ac58c2bd324f93f841f305d9f9d9ded93d1e --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File __init__.py +Description: +""" \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/collector/collector.py b/sysom_server/sysom_cluster_health/app/collector/collector.py new file mode 100644 index 0000000000000000000000000000000000000000..e6a4282dabeb7898e4f3e5d7f5f8bcab5d64553f --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/collector.py @@ -0,0 +1,172 @@ +import time +from conf.settings import * +from multiprocessing import Queue, Process +from schedule import Scheduler +from os import getpid, kill +from typing import Dict +from clogger import logger +from app.collector.metric_manager import MetricManager +from app.collector.metric_exception import MetricProcessException +from lib.common_type import Labels, Level +from lib.utils import collect_all_clusters, collect_instances_of_cluster, \ + collect_pods_of_instance + + +class Collector(Process): + def __init__( + self, + queue: Queue = None, + metric_manager: MetricManager = None, + parent_pid: int = None + ) -> None: + super().__init__(daemon=True) + self.collect_interval = COLLECT_INTERVAL + self.collector_host_schedule: Scheduler = Scheduler() + self.metric_manager = metric_manager + self.last_end_time = time.time() - self.collect_interval + self.last_alarm_table: Dict[str, int] = {} + self.queue = queue + self.parent_pid = parent_pid + + def _check_if_parent_is_alive(self): + try: + kill(self.parent_pid, 0) + except OSError: + logger.info(f"Analyzer's parent {self.parent_pid} is exit") + exit(0) + + def _deliver_one_alarm(self, metric, labels: Labels, level: Level, + score: float, value: float): + if metric.settings.alarm is None: + return + + type = metric.settings.type + threshold = metric.settings.alarm.threshold + metric_id = metric.settings.metric_id + + key = f"{labels.cluster}-{labels.instance}-{labels.namespace}" \ + f"-{labels.pod}-{metric_id}" + + # score lower than threshold, deliver alarm + if score <= threshold: + if key not in self.last_alarm_table: + self.last_alarm_table[key] = 0 + + continue_alarm = self.last_alarm_table[key] + # first alarm, deliver it and raise diagnose + if continue_alarm == 0: + alart_id = metric.deliver_alarm(value, type) + metric.deliver_diagnose(alart_id, level, type, self.queue) + + self.last_alarm_table[key] += 1 + + # if alarm list is longer than MERGE_NUM, resend the alarms + if continue_alarm > ALARM_MERGE_NUM: + self.last_alarm_table[key] = 0 + else: + # if continuesly alarm end, reset the alarm list + if key in self.last_alarm_table: + del self.last_alarm_table[key] + + def _collect_process_one(self, level: Level, labels: Labels): + """Collect and process one cluster/node/pod's all metrics + + Args: + level (Level): cluster/node/pod + labels (Labels): cluster/node/pod labels + """ + + for metric in self.metric_manager.registed_metric[level]: + try: + value, score = metric.metric_score(labels, self.last_end_time) + except MetricProcessException as e: + logger.info(f"Calculate Metric: {metric.settings.metric_id} " + f"of Pod: {labels.pod} of Node: {labels.instance} " + f"of Cluster: {labels.cluster} failed {e}") + continue + + metric.deliver_health_metric(value, score) + self._deliver_one_alarm(metric, labels, level, score, value) + + def _collect_process_pod_metric(self, name:str, ns:str, + level: Level, labels: Labels): + pod_label = Labels( + cluster=labels.cluster, + instance=labels.instance, + pod=name, + namespace=ns + ) + + # collect and process a pod's all metrics + self._collect_process_one(level, pod_label) + + def _collect_process_node_metric(self, name:str, level: Level, + labels: Labels): + pods_list = [] + node_label = Labels(cluster=labels.cluster, instance=name) + + # collect and process a node's all metrics + self._collect_process_one(level, node_label) + + pods_list = collect_pods_of_instance( + name, + self.metric_manager.metric_reader, + self.collect_interval + ) + + for pod, ns in pods_list: + self._collect_process_pod_metric(pod, ns, Level.Pod, node_label) + + + def _collect_process_cluster_metric(self, cluster: str): + level = Level.Cluster + labels = Labels(cluster=cluster) + nodes_list = [] + + # collect and process a cluster's all metrics + self._collect_process_one(level, labels) + + nodes_list = collect_instances_of_cluster( + cluster, + self.metric_manager.metric_reader, + self.collect_interval + ) + + for node in nodes_list: + self._collect_process_node_metric(node, Level.Node, labels) + + + def _register_task(self): + cluster_list = [] + + cluster_list = collect_all_clusters(self.metric_manager.metric_reader) + # no cluster label, we assume just one, and names it "dafault" + if len(cluster_list) == 0 or NO_CLUSTER_LABEL is True: + cluster_list.append("default") + + start_time = time.time() + + for cluster in cluster_list: + self._collect_process_cluster_metric(cluster) + + end_time = time.time() + self.last_end_time = end_time + logger.info(f"Excutaion time: {end_time - start_time}") + + + def run(self) -> None: + logger.info(f'健康度内置指标采集分析守护进程PID: {getpid()}') + + self._register_task() + self.collector_host_schedule.every(self.collect_interval)\ + .seconds.do(self._register_task) + + while True: + self._check_if_parent_is_alive(); + + if self.is_alive(): + self.collector_host_schedule.run_pending() + else: + break + time.sleep(max(1, int(self.collect_interval / 2))) + diff --git a/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_cpu_util.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_cpu_util.py new file mode 100644 index 0000000000000000000000000000000000000000..076f7d2599e72abb8fc74d4d873e892ad66b31d8 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_cpu_util.py @@ -0,0 +1,51 @@ +import conf.settings as settings +from metric_reader import MetricReader +from lib.common_type import Level +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_exception import MetricSettingsException,\ + MetricCollectException, MetricProcessException + +NODE_LABEL = settings.NODE_LABEL + + +# for node cpu util, need to use(1 - sysom_proc_cpu_total{mode="idle"}) to cal +class NodeCpuUtil(CapacityMetric): + def __init__(self, metric_reader: MetricReader, metric_settings, + level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _collect_process_metric(self): + if self.level != Level.Node: + raise MetricSettingsException( + f'{self.settings.collect.metric_name} is a node level metric!' + ) + + value = self.settings.collect.related_value[0] + query_args = { + NODE_LABEL: self.name[Level.Node], + self.settings.collect.node_tag_name: value + } + + res = self._get_custom_metric(self.settings.collect.metric_name, + **query_args) + + if len(res.data) <= 0: + raise MetricCollectException( + f"Collect {self.settings.collect.metric_name}, Level:" + f" {self.level} from Prometheus failed!" + ) + + try: + max_values = [] + for i in range(len(res.data)): + values = res.data[i].to_dict()["values"] + # 区间值运算 + max_value = max( + (100 - float(value[1])) for value in values + ) + max_values.append(max_value) + final_value = max(max_values) + except Exception as exc: + raise MetricProcessException from exc + + return final_value diff --git a/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_fd_util.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_fd_util.py new file mode 100644 index 0000000000000000000000000000000000000000..06fb39896e3a832597a1f85e50e2b8630e57a235 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_fd_util.py @@ -0,0 +1,44 @@ +from typing import Dict +from metric_reader import MetricReader +from sysom_utils import SysomFramework +from lib.common_type import Level +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_type.metric_type import DiagnoseInfo + + +class NodeFdUtil(CapacityMetric): + def __init__(self, metric_reader: MetricReader, metric_settings, + level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _collect_process_metric(self): + return self._usage_total_process(is_avaliable=False) + + def construct_diagnose_req( + self, diagnose_info: DiagnoseInfo) -> Dict[str, str]: + threshold = "80" + command = "sysak sysctl " + "fd " + str(threshold) + + return { + "service_name": self.settings.alarm.service_name, + "instance": diagnose_info.instance.split(":")[0], + "command": command, + "channel": "ssh", + } + + def process_diagnose_req(self, diagnose_info: DiagnoseInfo, data_dict): + result = data_dict["CommandResult"]["data"][0]["value"] + lines = result.split("\n")[1:-1] + + SysomFramework.alarm_action("ADD_ANNOTATION", { + "alert_id": diagnose_info.alarm_id, + "annotations": { + "节点fd使用量top10进程": lines, + "修复建议": [ + "1.重启进程释放其占用的fd", + "2.通过ulimit调整进程的fd上限", + "3.调整系统的全局fd上限", + "4.检查以上进程是否存在fd泄露" + ] + } + }) diff --git a/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_load_avg.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_load_avg.py new file mode 100644 index 0000000000000000000000000000000000000000..f3c13854c8c2e8a2efe7691b1bfdbf16523c242b --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_load_avg.py @@ -0,0 +1,34 @@ +import conf.settings as settings +from metric_reader import MetricReader +from lib.common_type import Level +from app.collector.metric_exception import MetricCollectException +from app.collector.metric_type.load import LoadMetric + +NODE_LABEL = settings.NODE_LABEL +CPU_COUNT_METRIC = "sysom_proc_cpus" + + +class NodeLoadAvg(LoadMetric): + def __init__(self, metric_reader: MetricReader, metric_settings, + level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _collect_process_metric(self): + return self._default_single_gauge() + + # 对于节点的load average,需要根据CPU核数来确定影响,所以自定义calculate_score方法 + def _calculate_score(self, metric_value: float) -> float: + def _count_node_cpus(): + query_args = {NODE_LABEL: self.name[Level.Node], "mode": "idle"} + res = self._get_custom_metric(CPU_COUNT_METRIC, **query_args) + if len(res.data) <= 0: + raise MetricCollectException( + f"Get {CPU_COUNT_METRIC} metric failed!") + + return len(res.data) + + cpu_num = _count_node_cpus() + res = self.score_interp(metric_value / cpu_num) + score = round(float(res.tolist()), 2) + + return score diff --git a/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_inode_util.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_inode_util.py new file mode 100644 index 0000000000000000000000000000000000000000..07e710008d2664e5b96f16fcfb529041981a7107 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_inode_util.py @@ -0,0 +1,62 @@ +import conf.settings as settings +from metric_reader import MetricReader +from lib.common_type import Level +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_exception import MetricSettingsException,\ + MetricCollectException, MetricProcessException + +MOUNTPOINT = "/" +NODE_LABEL = settings.NODE_LABEL + + +# node roootfs inode util = (100 - f_favail/f_files * 100) +class NodeRootfsInodeUtil(CapacityMetric): + def __init__(self, metric_reader: MetricReader, + metric_settings, level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _collect_process_metric(self): + metric_name = self.settings.collect.metric_name + node_tag = self.settings.collect.node_tag_name + + if self.level != Level.Node: + raise MetricSettingsException( + f'{metric_name} is a node level metric!' + ) + + avail_value = self.settings.collect.related_value[0] + files_value = self.settings.collect.related_value[1] + query_args = { + NODE_LABEL: self.name[Level.Node], + node_tag: avail_value, + "mount": MOUNTPOINT + } + + favail_res = self._get_custom_metric(metric_name, + **query_args) + + query_args[node_tag] = files_value + files_res = self._get_custom_metric(metric_name, + **query_args) + + if not len(favail_res.data) == len(files_res.data) == 1: + raise MetricCollectException( + f"Collect {metric_name}," + f" Level: {self.level} from Prometheus failed!" + ) + + try: + max_favail = max( + [float(item[1]) + for item in favail_res.data[0].to_dict()["values"]] + ) + max_files = max( + [float(item[1]) + for item in files_res.data[0].to_dict()["values"]] + ) + final_value = (100 - max_favail / max_files * 100) + + except Exception as exc: + raise MetricProcessException from exc + + return final_value diff --git a/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_util.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_util.py new file mode 100644 index 0000000000000000000000000000000000000000..52b7f6177a3e1a21edec28350a3adf53675b94d5 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_util.py @@ -0,0 +1,76 @@ +import conf.settings as settings +from metric_reader import MetricReader +from lib.common_type import Level +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_exception import MetricSettingsException,\ + MetricCollectException, MetricProcessException + +MOUNTPOINT = "/" +NODE_LABEL = settings.NODE_LABEL + + +# node roootfs util = (100 - f_bavail/(f_blokcs - f_bfree + f_bavail) * 100) +class NodeRootfsUtil(CapacityMetric): + def __init__(self, metric_reader: MetricReader, + metric_settings, level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _collect_process_metric(self): + metric_name = self.settings.collect.metric_name + node_tag = self.settings.collect.node_tag_name + + if self.level != Level.Node: + raise MetricSettingsException( + f"{self.settings.collect.metric_name} is a " + f"node level metric!" + ) + + avail_value = self.settings.collect.related_value[0] + blocks_value = self.settings.collect.related_value[1] + bfree_balue = self.settings.collect.related_value[2] + query_args = { + NODE_LABEL: self.name[Level.Node], + node_tag: avail_value, + "mount": MOUNTPOINT + } + + bavail_res = self._get_custom_metric(metric_name, + **query_args) + + query_args[node_tag] = blocks_value + blocks_res = self._get_custom_metric(metric_name, + **query_args) + + query_args[node_tag] = bfree_balue + bfree_res = self._get_custom_metric(metric_name, + **query_args) + + # 因为是节点指标,查询出来的data应该只有1个 + if not (len(bavail_res.data) == len(blocks_res.data) + == len(bfree_res.data) == 1): + raise MetricCollectException( + f"Collect {metric_name}," + " Level: {self.level} from Prometheus failed!" + ) + + try: + # 都取区间向量中点的最大值 + max_bavail = max( + [float(item[1]) + for item in bavail_res.data[0].to_dict()["values"]] + ) + max_blocks = max( + [float(item[1]) + for item in blocks_res.data[0].to_dict()["values"]] + ) + max_bfree = max( + [float(item[1]) + for item in bfree_res.data[0].to_dict()["values"]] + ) + final_value = (100 - max_bavail / (max_blocks - + max_bfree + max_bavail) * 100) + + except Exception as exc: + raise MetricProcessException from exc + + return final_value diff --git a/sysom_server/sysom_cluster_health/app/collector/metric_exception.py b/sysom_server/sysom_cluster_health/app/collector/metric_exception.py new file mode 100644 index 0000000000000000000000000000000000000000..71bdbeb2abd030514b550605470cedaadc9e1bca --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/metric_exception.py @@ -0,0 +1,28 @@ +class MetricException(Exception): + """Metric base exception + + This class defines the base exception for Cluster Health Calculator, + and all exceptions thrown during health calculator should inherit + from this class. + """ + + +class MetricSettingsException(MetricException): + """Exceptions thrown for Metric settings + + This exception should be thrown if the Metric has invalid settings. + """ + + +class MetricCollectException(MetricException): + """Exceptions thrown for Metric collection + + This exception should be thrown if the Metric collection fails. + """ + + +class MetricProcessException(MetricException): + """Exceptions thrown for Metric process + + This exception should be thrown if the Metric process fails. + """ diff --git a/sysom_server/sysom_cluster_health/app/collector/metric_manager.py b/sysom_server/sysom_cluster_health/app/collector/metric_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..f7ad068d471e0384cc5d9815d6725d344a65b3f6 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/metric_manager.py @@ -0,0 +1,73 @@ +import importlib +from conf.settings import * +from app.collector.metric_exception import MetricSettingsException +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_type.load import LoadMetric +from app.collector.metric_type.latency import LatencyMetric +from app.collector.metric_type.error import ErrorMetric +from lib.common_type import Level +from metric_reader import dispatch_metric_reader + +CUSTOM_METRIC_DIR = "app.collector.custom_metric" + +METRIC_TYPE = { + "CapacityMetric": CapacityMetric, + "LoadMetric": LoadMetric, + "LatencyMetric": LatencyMetric, + "ErrorMetric": ErrorMetric +} + +class MetricManager(): + def __init__(self): + self.registed_metric = {} + self.metric_reader = dispatch_metric_reader( + "prometheus://" + PROMETHEUS_CONFIG.host + ":" + str(PROMETHEUS_CONFIG.port)) + + def _metric_register(self, all_metrics, level): + self.registed_metric[level] = [] + + for metric in all_metrics: + try: + if "filename" in metric["Collect"] \ + and metric["Collect"]["filename"] != "": + # load non_standard metric from file + filename = metric["Collect"]["filename"] + module_name = f'{CUSTOM_METRIC_DIR}.{filename}' + class_name = filename.title().replace("_", "") + try: + metric_module = importlib.import_module(module_name) + if hasattr(metric_module, class_name): + metric_class = getattr(metric_module, class_name) + metric_instance = metric_class(self.metric_reader, + metric, level) + except ModuleNotFoundError as exc: + raise MetricSettingsException( + f"{module_name} not exist!" + ) from exc + except MetricSettingsException as exc: + raise exc + + else: + metric_class = METRIC_TYPE.get(metric["Type"]) + if metric_class is not None: + metric_instance = metric_class( + self.metric_reader, metric, level) + + except Exception as exc: + raise MetricSettingsException( + f"Collector setting error of metric {metric}: {exc}" + ) from exc + + self.registed_metric[level].append( + metric_instance) + + def metric_register(self): + """ + Register all metrics to metric manager from settings + """ + try: + self._metric_register(CLUSTER_METRICS, Level.Cluster) + self._metric_register(POD_METRICS, Level.Pod) + self._metric_register(NODE_METRICS, Level.Node) + except MetricSettingsException as exc: + raise exc diff --git a/sysom_server/sysom_cluster_health/app/collector/metric_type/capacity.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/capacity.py new file mode 100644 index 0000000000000000000000000000000000000000..5640b88b12687a6df6c71f64c53e384969c35928 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/capacity.py @@ -0,0 +1,132 @@ +from lib.common_type import Level +from app.collector.metric_type.metric_type import Metric, MetricReader,\ + InsAggregationType +from app.collector.metric_exception import MetricCollectException, \ + MetricSettingsException +from conf.settings import NODE_LABEL, POD_LABEL,\ + CLUSTER_LABEL, POD_METRIC_TAG + + +class CapacityMetric(Metric): + def __init__(self, metric_reader: MetricReader, metric_settings, + level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _initalize_score_settings(self, score_setting): + return super()._initalize_score_settings(score_setting) + + def _usage_total_process(self, is_avaliable: bool = False): + """ + process usage/total or 1 - avaliable/total type metric + """ + final_result = None + usage_query_args = {} + limit_query_args = {} + metric_name = self.settings.collect.metric_name + node_tag = self.settings.collect.node_tag_name + val_0 = self.settings.collect.related_value[0] + val_1 = self.settings.collect.related_value[1] + + if self.level == Level.Node: + usage_query_args = { + NODE_LABEL: self.name[Level.Node], + node_tag: val_0 + } + limit_query_args = { + NODE_LABEL: self.name[Level.Node], + node_tag: val_1 + } + elif self.level == Level.Pod: + usage_query_args = { + NODE_LABEL: self.name[Level.Node], + POD_LABEL: self.name[Level.Pod], + POD_METRIC_TAG: val_0 + } + limit_query_args = { + NODE_LABEL: self.name[Level.Node], + POD_LABEL: self.name[Level.Pod], + POD_METRIC_TAG: val_1 + } + else: + usage_query_args = { + CLUSTER_LABEL: self.name[Level.Cluster], + node_tag: val_0 + } + limit_query_args = { + CLUSTER_LABEL: self.name[Level.Cluster], + node_tag: val_1 + } + pass + + usage_res = self._get_custom_metric(metric_name, **usage_query_args) + limit_res = self._get_custom_metric(metric_name, **limit_query_args) + if len(usage_res.data) <= 0 or len(limit_res.data) <= 0 or \ + len(usage_res.data) != len(limit_res.data): + raise MetricCollectException( + f"Collect {metric_name}," + f"Level: {self.level} from Prometheus failed!" + ) + + try: + all_data_result = [] + usages = 0 + limits = 0 + for i in range(len(usage_res.data)): + usage_data = usage_res.data[i].to_dict()["values"] + usage_value = max([float(item[1]) for item in usage_data]) + + limit_data = limit_res.data[i].to_dict()["values"] + limit_value = max([float(item[1]) for item in limit_data]) + + if self.level == Level.Cluster: + usages += usage_value + limits += limit_value + else: + # 对于容器级别指标,每个容器都使用usage/limit * 100 算出单个容器使用率 + util_result = float(usage_value) / float(limit_value) * 100 + all_data_result.append(util_result) + + if self.level == Level.Cluster: + # 对于集群级别的指标,cluster_util = sum(节点usage) / sum(节点limit) + final_result = usages / limits * 100 + else: + # 对于节点级别的指标,all_data_result只有一个元素,即节点util + # 对于容器级别的指标,pod_utl = max(容器util) (因为pod中可能有容器有limit,有容器没有) + final_result = self._aggregation( + all_data_result, InsAggregationType.Max) + + except Exception as exc: + raise MetricCollectException() from exc + + if is_avaliable: + # usage = total - avaliable + final_result = 100 - final_result + + return final_result + + def _collect_process_metric(self): + """ + We offer three standard method to process capacity metric: + standard_type = 1: the metric is already util + standard_type = 2: the metric is total && used + standard_type = 3: the metric is total && avaliable + """ + + standard_type = self.settings.collect.standard_type + + if standard_type == 1: + return super()._default_single_gauge( + ins_agg_type=InsAggregationType.Sum + ) + # 对于吐上来是usage && total的指标处理 + elif standard_type == 2: + return self._usage_total_process(is_avaliable=False) + # 对于吐上来是avaliable && total的指标处理 + elif standard_type == 3: + return self._usage_total_process(is_avaliable=True) + else: + raise MetricSettingsException( + f'illegal standard type:{standard_type}' + ) + + \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/collector/metric_type/error.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/error.py new file mode 100644 index 0000000000000000000000000000000000000000..25d70d94f88ea0138f219a82d25c7dcd6b90e060 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/error.py @@ -0,0 +1,20 @@ +from lib.common_type import Level +from app.collector.metric_type.metric_type import Metric, MetricReader,\ + RangeAggregationType, InsAggregationType +from app.collector.metric_exception import MetricSettingsException + + +class ErrorMetric(Metric): + def __init__(self, metric_reader: MetricReader, metric_settings, + level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _collect_process_metric(self) -> float: + if self.settings.collect.standard_type != 1: + raise MetricSettingsException() + + return super()._default_single_counter( + related_value=self.settings.collect.related_value[0], + range_agg_type=RangeAggregationType.Increase, + ins_agg_type=InsAggregationType.Sum + ) diff --git a/sysom_server/sysom_cluster_health/app/collector/metric_type/latency.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/latency.py new file mode 100644 index 0000000000000000000000000000000000000000..437b95bfb50b70ceee603d502f76ec8b735812b8 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/latency.py @@ -0,0 +1,71 @@ +from lib.common_type import Level +from app.collector.metric_type.metric_type import Metric, MetricReader,\ + MetricSettingsException, RangeAggregationType, InsAggregationType + + +class LatencyMetric(Metric): + def __init__(self, metric_reader: MetricReader, metric_settings, + level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _process_latency_time_counter( + self, + range_agg_type: RangeAggregationType = RangeAggregationType.Increase, + ins_agg_type: InsAggregationType = InsAggregationType.Sum + ): + three_weights = [0.1, 0.3, 0.6] + two_weights = [0.3, 0.7] + + # query all latency range + latency_range_result = [] + for range in self.settings.collect.related_value: + latency_range_result.append( + self._default_single_counter( + related_value=range, + range_agg_type=range_agg_type, + ins_agg_type=ins_agg_type + ) + ) + + final_value = None + if len(latency_range_result) >= 3: + partition = len(latency_range_result) // 3 + remainder = len(latency_range_result) % 3 + final_value = sum([ + weight * value + for weight, value in zip( + three_weights, + [ + sum(latency_range_result[:partition + remainder]), + sum(latency_range_result + [partition + remainder: partition * 2 + remainder] + ), + sum(latency_range_result[partition * 2 + remainder:]) + ] + ) + ]) + elif len(latency_range_result) == 2: + final_value = two_weights[0] * latency_range_result[0] + \ + two_weights[1] * latency_range_result[1] + else: + final_value = latency_range_result[0] + + return final_value + + def _collect_process_metric(self) -> float: + """ + We offer two standard method to process latency metric: + standard_type = 1: the metric is already latency + standard_type = 2: the metric is histogram + """ + + standard_type = self.settings.collect.standard_type + if standard_type == 1: + return super()._default_single_gauge() + elif standard_type == 2: + return self._process_latency_time_counter() + else: + raise MetricSettingsException( + f'illegal standard type:{standard_type}' + ) + diff --git a/sysom_server/sysom_cluster_health/app/collector/metric_type/load.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/load.py new file mode 100644 index 0000000000000000000000000000000000000000..97d7cfa87edc6babeb1a25d3e3079d39d7e45522 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/load.py @@ -0,0 +1,34 @@ +from lib.common_type import Level +from app.collector.metric_type.metric_type import Metric, MetricReader,\ + RangeAggregationType, InsAggregationType +from app.collector.metric_exception import MetricSettingsException + + +class LoadMetric(Metric): + def __init__(self, metric_reader: MetricReader, metric_settings, + level: Level): + super().__init__(metric_reader, metric_settings, level) + + def _collect_process_metric(self) -> float: + """ + We offer two standard method to process load metric: + standard_type = 1: the metric is already load + standard_type = 2: the metric is counter + """ + + standard_type = self.settings.collect.standard_type + + if standard_type == 1: + return super()._default_single_gauge() + + elif standard_type == 2: + return super()._default_single_counter( + related_value=self.settings.collect.related_value[0], + range_agg_type=RangeAggregationType.Rate, + ins_agg_type=InsAggregationType.Max + ) + else: + raise MetricSettingsException( + f'illegal standard type:{standard_type}' + ) + diff --git a/sysom_server/sysom_cluster_health/app/collector/metric_type/metric_type.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/metric_type.py new file mode 100644 index 0000000000000000000000000000000000000000..4655f1b15faa3b2ea5c7f87d0b4a732f346600bb --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/metric_type.py @@ -0,0 +1,438 @@ +import sys +import uuid +import math +import conf.settings as settings +from clogger import logger +from enum import Enum +from time import time +from typing import List, Dict, Optional, Tuple +from multiprocessing import Queue +from queue import Full +from scipy.interpolate import interp1d +from sysom_utils import SysomFramework +from metric_reader import RangeQueryTask, InstantQueryTask, MetricReader +from dataclasses import dataclass +from lib.common_type import Level, Labels +from app.diagnose.diagnose_info import DiagnoseInfo +from app.collector.metric_exception import MetricSettingsException +from app.collector.metric_exception import MetricCollectException +from app.collector.metric_exception import MetricProcessException + +CLUSTER_LABEL = settings.CLUSTER_LABEL +POD_LABEL = settings.POD_LABEL +NODE_LABEL = settings.NODE_LABEL +POD_METRIC_TAG = settings.POD_METRIC_TAG + +class RangeAggregationType(Enum): + Increase = 0 + Rate = 1 + Irate = 2 + MAX_OVER_TIME = 3 + AVG_OVER_TIME = 4 + + +class InsAggregationType(Enum): + Sum = 0 + Max = 1 + Avg = 2 + + +@dataclass +class Collect: + metric_name: str + related_value: List[str] + standard_type: int + node_tag_name: Optional[str] = None + filename: Optional[str] = None + +@dataclass +class Alarm: + threshold: int + diagnose_type: str + diagnose_url: Optional[str] = None + service_name: Optional[str] = None + + +@dataclass +class MetricSettings: + metric_id: str + type: str + collect: Collect + score: Dict[str, int] + alarm: Optional[Alarm] + + +class Metric(): + def __init__(self, metric_reader: MetricReader, + metric_settings, level: Level): + self.metric_reader = metric_reader + self.level = level + self.name = {} # self.name[self.level] = cluster/node/pod name + self.last_end_time = 0 + self.score_interp = None + self.settings = None + self._initalize_settings(metric_settings) + + ########################################################################## + # Inner funtions + ########################################################################## + + def _initalize_settings(self, settings): + try: + self.settings = MetricSettings( + metric_id = settings["MetricID"], + type = settings["Type"], + collect = Collect(**settings["Collect"]), + score = settings["Score"], + alarm = Alarm(**settings["Alarm"]) if settings.get("Alarm") else None + ) + + if self.level == Level.Node: + if not self.settings.collect.node_tag_name: + raise MetricSettingsException( + f"node_tag_name must set " + f"in {self.settings.metric_id}!" + ) + + if self.settings.alarm is not None: + if self.settings.alarm.diagnose_type == "link": + if not self.settings.alarm.diagnose_url: + raise MetricSettingsException( + f"diagnose_url must set " + f"in {self.settings.metric_id}!" + ) + + if self.settings.alarm.diagnose_type == "custom": + if not self.settings.alarm.service_name: + raise MetricSettingsException( + f"service_name must set " + f"in {self.settings.metric_id}!" + ) + + self._initalize_score_settings(self.settings.score) + except Exception as exc: + raise MetricSettingsException() from exc + + def _initalize_score_settings(self, score_setting): + X = [] + Y = [] + + for score, metric_value in score_setting.items(): + Y.append(int(score)) + X.append(metric_value) + + # in early version of scipy, X[0] can't be 0 + if X[0] == 0: + X[0] = -sys.float_info.epsilon + + # 分数随指标值增加而下降,在头尾补上极端值 + X.insert(0, -sys.float_info.epsilon) + Y.insert(0, 100) + X.append(sys.maxsize) + Y.append(0) + self.score_interp = interp1d(X, Y) + + def _get_custom_metric(self, metric_name: str, **kwargs): + task = RangeQueryTask(metric_name, + start_time=self.last_end_time, + end_time=time()) + for key, value in kwargs.items(): + task.append_equal_filter(key, value) + + return self.metric_reader.range_query([task]) + + def _aggregation(self, data: List[float], + aggre: InsAggregationType) -> float: + if aggre == InsAggregationType.Sum: + return sum(data) + elif aggre == InsAggregationType.Max: + return max(data) + elif aggre == InsAggregationType.Avg: + return sum(data) / len(data) + + def _default_single_gauge( + self, + ins_agg_type: InsAggregationType = InsAggregationType.Max + ) -> float: + """Collect and process one gauge metric(max, max) + + 对于采集的指标是gauge类型: + value = max(range) + pod = max(containers) + cluster = max(nodes) + + final_value = max(max(query_result)) + """ + + query_args = {} + node_tag = self.settings.collect.node_tag_name + val = self.settings.collect.related_value[0] + + if self.level == Level.Node: + query_args = { + NODE_LABEL: self.name[Level.Node], + node_tag: val + } + elif self.level == Level.Pod: + query_args = { + NODE_LABEL: self.name[Level.Node], + POD_LABEL: self.name[Level.Pod], + POD_METRIC_TAG: val + } + else: + query_args = { + CLUSTER_LABEL: self.name[Level.Cluster], + node_tag: val + } + pass + + res = self._get_custom_metric( + self.settings.collect.metric_name, **query_args) + if len(res.data) <= 0: + raise MetricCollectException( + f"Collect {self.settings.collect.metric_name}, Level: " + f"{self.level} from Prometheus: no data!" + ) + # print(json.dumps(res.to_dict())) + try: + max_values = [] + # 对于容器指标:多个data表示是同一个pod的多个容器 + # 对于节点指标:应该只有一个data + # 对于集群指标:多个data表示集群中的多个节点 + for i in range(len(res.data)): + values = res.data[i].to_dict()["values"] + # 取区间向量所有点的的最大值 + max_value = max(float(value[1]) for value in values) + max_values.append(max_value) + final_value = self._aggregation(max_values, ins_agg_type) + except Exception as exc: + raise MetricProcessException() from exc + + return final_value + + def _default_single_counter( + self, + related_value: str, + range_agg_type: RangeAggregationType, + ins_agg_type: InsAggregationType = InsAggregationType.Sum + ) -> float: + """Collect and process one counter metric(increase/rate/irate, max) + """ + + metric_name = self.settings.collect.metric_name + node_tag = self.settings.collect.node_tag_name + + query_interval = int(time() - self.last_end_time) + if query_interval < 60: + query_interval = 60 + query_interval_str = f"{query_interval}s" + + aggr_str = range_agg_type.name.lower() + if self.level == Level.Pod: + task = InstantQueryTask(metric_name, + time=time(), aggregation=aggr_str, + interval=query_interval_str) \ + .append_equal_filter(NODE_LABEL, self.name[Level.Node]) \ + .append_equal_filter(POD_LABEL, self.name[Level.Pod]) \ + .append_equal_filter(POD_METRIC_TAG, related_value) + elif self.level == Level.Node: + task = InstantQueryTask(metric_name, + time=time(), aggregation=aggr_str, + interval=query_interval_str) \ + .append_equal_filter(NODE_LABEL, self.name[Level.Node]) \ + .append_equal_filter(node_tag, related_value) + else: + # cluster level + task = InstantQueryTask(metric_name, + time=time(), aggregation=aggr_str, + interval=query_interval_str) \ + .append_equal_filter(CLUSTER_LABEL, self.name[Level.Cluster]) \ + .append_equal_filter(node_tag, related_value) + + res = self.metric_reader.instant_query([task]) + if len(res.data) <= 0: + raise MetricCollectException( + f"Collect {metric_name}, Value: {related_value}," + f"Level: {self.level} from Prometheus failed: no data!" + ) + # logger.info(json.dumps(res.to_dict())) + final_value = None + try: + # 一般情况下,对于容器指标:data[0],data[1]..表示是同一个pod的多个容器 + # 区间向量已经通过promql的increase函数聚合,只需要将容器指标聚合成pod指标即可 + # 注意instant_query返回的值的key是"value" + values = [float(res.data[i].to_dict()["value"][1]) + for i in range(len(res.data))] + + final_value = self._aggregation(values, ins_agg_type) + + except Exception as exc: + raise MetricProcessException() from exc + + return final_value + + def _collect_process_metric(self) -> float: + """Collect metric and preprocess metrics from prometheus, + return a value to calculate score + + Returns: + float: metric value after collect and preprocess + """ + raise NotImplementedError("_collect_process_metric not implememted!") + + def _calculate_score(self, metric_value: float) -> float: + res = self.score_interp(metric_value) + # res is numpy.ndarry, convert to float + score = round(float(res.tolist()), 2) + return score + + ########################################################################## + # Outer funtions + ########################################################################## + def deliver_health_metric(self, metric_value: float, score: float): + """Deliver health metric to sysom health score calculator + + Args: + metric_value: metric value + score: score after calculation + """ + health_metric = { + "metric_id": self.settings.metric_id, + "process_time": time(), + "event_time": time(), + "score": score, + "value": metric_value, + "layer": self.level.value, + "cluster": self.name[Level.Cluster], + } + if self.level == Level.Node: + health_metric["node"] = self.name[Level.Node] + if self.level == Level.Pod: + health_metric["pod"] = self.name[Level.Pod] + health_metric["namespace"] = self.name['namespace'] + + SysomFramework.abnormal_metric(health_metric) + + def deliver_alarm(self, metric_value: float, type: str) -> str: + alarm_uuid = uuid.uuid4() + metric_value = round(metric_value, 2) + + SysomFramework.alarm({ + "alert_id": str(alarm_uuid), + "instance": self.name[self.level], + "alert_item": self.settings.metric_id, + "alert_category": "MONITOR", + "alert_source_type": "health check", + "alert_time": int(round(time() * 1000)), + "status": "FIRING", + "labels": { + "cluster": self.name[Level.Cluster], + "node": self.name[Level.Node], + "pod": self.name[Level.Pod], + "metric_type": type, + }, + "annotations": { + "summary": f"{self.settings.metric_id} has low score with" + f" value {metric_value}" + } + }) + + return str(alarm_uuid) + + def deliver_diagnose(self, alarm_id: str, level: Level, + type: str, queue: Queue): + """Deliver diagnose info to diagnose worker + + Args: + alarm_id: alarm id + level: level of this metric + type: metric type + queue: diagnose queue + """ + + diagnose_type = self.settings.alarm.diagnose_type + + if diagnose_type == "link": + SysomFramework.alarm_action("ADD_OPT", { + "alert_id": str(alarm_id), + "opt": { + "key": self.settings.metric_id, + "label": self.settings.metric_id, + "type": "LINK", + "url": self.settings.alarm.diagnose_url + } + }) + elif diagnose_type == "custom": + try: + queue.put( + DiagnoseInfo( + alarm_id=str(alarm_id), + service_name=self.settings.alarm.service_name, + type=type, + level=level, + metric_id=self.settings.metric_id, + instance=self.name[Level.Node], + ), + block=False + ) + except Full: + logger.error(f"Diagnose queue is full!, " + f"drop alarm {alarm_id}'s diagnose!") + except Exception as e: + logger.error(f"Deliver diagnose info of alarm {alarm_id} " + f"to diagnose worker failed: {e}") + pass + + def construct_diagnose_req( + self, diagnose_info: DiagnoseInfo) -> Dict[str, str]: + """Construct diagnose request to query sysom diagnosis center + + Args: + diagnose_info: diagnose info + + Returns: + Dict: query request + """ + raise NotImplementedError("construct_diagnose_req not implememted!") + + def process_diagnose_req(self, result): + """Process diagnose result from sysom diagnosis center + + Args: + result: diagnose result + """ + raise NotImplementedError("process_diagnose_req not implememted!") + + def metric_score(self, labels: Labels, last_end_time: float) -> Tuple[float, float]: + """Calculate the final score of this metric + + Args: + labels: cluster/node/pod labels + last_end_time: end time of last calculate interval + + Raises: + MetricCollectException + MetricSettingsException + NotImplementedError + + Returns: + (metric_value, score): metric value and score after calculation + """ + self.name[Level.Pod] = labels.pod + self.name['namespace'] = labels.namespace + self.name[Level.Node] = labels.instance + self.name[Level.Cluster] = labels.cluster + self.last_end_time = last_end_time + + metric_value = None + score = None + try: + metric_value = self._collect_process_metric() + score = self._calculate_score(metric_value) + score = math.floor(score) + except Exception as e: + err = f"Calculate metric: {self.settings.collect.metric_name} " + \ + f"score failed: {str(e)}!" + raise MetricProcessException(err) from e + + return metric_value, score diff --git a/sysom_server/sysom_cluster_health/app/consumer/consumer.py b/sysom_server/sysom_cluster_health/app/consumer/consumer.py new file mode 100644 index 0000000000000000000000000000000000000000..3484daafa7ab9c4fdec7b13121ae3264c666fb8f --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/consumer/consumer.py @@ -0,0 +1,74 @@ +from cec_base.event import Event +from cec_base.consumer import Consumer +from cec_base.producer import Producer +from cec_base.cec_client import MultiConsumer, CecAsyncConsumeTask +from clogger import logger +from lib.common_type import Level +from app.health_calculator.health_metric import HealthMetric +from conf.settings import * +from sysom_utils import CecTarget, SysomFramework + + +class HealthMetricListener(MultiConsumer): + def __init__(self) -> None: + super().__init__( + YAML_CONFIG.get_cec_url(CecTarget.PRODUCER), + custom_callback=self.on_receive_event, + ) + self.append_group_consume_task( + CEC_TOPIC_SYSOM_HEALTH_METRIC, + "sysom_cluster_health", + Consumer.generate_consumer_id(), + ensure_topic_exist=True, + ) + + self.gcache_cluster_metrics = SysomFramework.gcache(CLUSTER_HEALTH_METRIC_GCACHE) + self.gcache_node_metrics = SysomFramework.gcache(NODE_HEALTH_METRIC_GCACHE) + self.gcache_pod_metrics = SysomFramework.gcache(POD_HEALTH_METRIC_GCACHE) + + def _delivery(self, topic: str, value: dict): + self._producer.produce(topic, value) + self._producer.flush() + + def _deal_health_metric(self, health_metric: HealthMetric): + health_metric = health_metric.dict() + layer = health_metric["layer"] + + if layer not in [Level.Cluster.value, Level.Node.value, Level.Pod.value]: + raise Exception(f"Invalid layer: {layer} of metric: {health_metric}") + + try: + if layer == Level.Cluster.value: + cluster = health_metric[cluster] + self.gcache_cluster_metrics.push_list(cluster, health_metric) + elif layer == Level.Node.value: + # use cluster and node as key in case of same node name in different cluster + key = f"{health_metric['cluster']}:{health_metric['node']}" + self.gcache_node_metrics.push_list(key, health_metric) + elif layer == Level.Pod.value: + key = f"{health_metric['cluster']}:{health_metric['pod']}:{health_metric['namespace']}" + self.gcache_pod_metrics.push_list(key, health_metric) + except Exception as e: + raise Exception( + f"Failed to deal with health metric: {health_metric}, error: {e}" + ) + + def on_receive_event(self, event: Event, task: CecAsyncConsumeTask): + """ + 处理每个单独的任务 + """ + event_value = event.value + try: + assert isinstance(event_value, dict) + if task.topic_name == CEC_TOPIC_SYSOM_HEALTH_METRIC: + health_metric = HealthMetric(**event_value) + self._deal_health_metric(health_metric) + else: + logger.warning( + f"Received not expect topic data, topic = {task.topic_name}" + ) + except Exception as e: + logger.exception(e) + finally: + # 执行消息确认 + task.ack(event) \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/crud.py b/sysom_server/sysom_cluster_health/app/crud.py new file mode 100644 index 0000000000000000000000000000000000000000..dff5cb83016a9fca1a68448366be17aebbd1dcb5 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/crud.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File crud.py +Description: +""" +import uuid +from clogger import logger +from typing import Optional, List +from sqlalchemy.orm import Session +from app import models, schemas, query + +################################################################################################ +# Define database crud here +################################################################################################ + +# def get_person_by_name(db: Session, name: str) -> Optional[models.Person]: +# return db.query(models.Person).filter(models.Person.name == name).first() + +# def create_person(db: Session, person: schemas.Person) -> models.Person: +# person = models.Person(**person.dict()) +# db.add(person) +# db.commit() +# db.refresh(person) +# return person + +# def del_person_by_id(db: Session, person_id: int): +# person = db.get(models.Person, person_id) +# db.delete(person) +# db.commit() + +# def get_person_list(db: Session, query_params: query.PersonQueryParams) -> List[models.Person]: +# return ( +# query_params.get_query_exp(db) +# .all() +# ) + + +def create_abnormal_metrics_data( + db: Session, abnormal_metrics: schemas.AbnormalMetricsBase +) -> Optional[models.BaseModel]: + base = { + "uuid": str(uuid.uuid4()), + "metric_id": abnormal_metrics.metric_id, + "metric_type": abnormal_metrics.metric_type, + "score": abnormal_metrics.score, + "value": abnormal_metrics.value, + "cluster": abnormal_metrics.cluster, + "timestamp": abnormal_metrics.timestamp + } + + if abnormal_metrics.instance == "": + db_abnormal_metrics = models.AbnormalMetricsCluster(**base) + elif abnormal_metrics.pod == "" and abnormal_metrics.instance != "": + db_abnormal_metrics = models.AbnormalMetricsNode( + instance=abnormal_metrics.instance, + **base + ) + elif abnormal_metrics.pod != "" and abnormal_metrics.namespace != "": + db_abnormal_metrics = models.AbnormalMetricsPod( + instance=abnormal_metrics.instance, + pod=abnormal_metrics.pod, + namespace=abnormal_metrics.namespace, + **base, + ) + else: + logger.error(f"Inserting Invalid abnormal_metrics " + f"to mysql: {abnormal_metrics}") + return None + + db.add(db_abnormal_metrics) + db.commit() + db.refresh(db_abnormal_metrics) + return db_abnormal_metrics + +def del_all_abnormal_metrics_data(db: Session): + db.query(models.AbnormalMetricsCluster).delete() + db.query(models.AbnormalMetricsNode).delete() + db.query(models.AbnormalMetricsPod).delete() + db.commit() + return \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/database.py b/sysom_server/sysom_cluster_health/app/database.py new file mode 100644 index 0000000000000000000000000000000000000000..13b4707c803e9113578e6e0d1a32d6a2a277faf2 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/database.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File database.py +Description: +""" +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from conf.settings import SQLALCHEMY_DATABASE_URL +from sysom_utils import FastApiResponseHelper + +engine = create_engine( + SQLALCHEMY_DATABASE_URL, connect_args={} +) + +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() + +Base = declarative_base() + +FastApiResponseHelper.bind_base_class(Base) \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/diagnose/diagnose_info.py b/sysom_server/sysom_cluster_health/app/diagnose/diagnose_info.py new file mode 100644 index 0000000000000000000000000000000000000000..deef033b69e4357439eaa359cd2acc588e90e10a --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/diagnose/diagnose_info.py @@ -0,0 +1,16 @@ +from dataclasses import dataclass +from typing import Optional, Dict, List +from lib.common_type import Level + +@dataclass +class DiagnoseInfo: + alarm_id: str + service_name: str + type: str + level: Level + metric_id: str + instance: str + pod: Optional[str] = None + container: Optional[str] = None + time: Optional[str] = None + diagnose_type: Optional[str] = None \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/diagnose/diagnose_worker.py b/sysom_server/sysom_cluster_health/app/diagnose/diagnose_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..04f5c9ad1423504973bba0d0dede9abe68e0cc17 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/diagnose/diagnose_worker.py @@ -0,0 +1,124 @@ +import time +import os +from multiprocessing import Process +from typing import Optional +from clogger import logger +from conf.settings import YAML_CONFIG +from datetime import datetime +from sysom_utils import SysomFramework, GClient +from app.diagnose.diagnose_info import DiagnoseInfo +from app.collector.metric_type.metric_type import Metric +from app.collector.metric_manager import MetricManager + +SYSOM_POLL_TIMEOUT = 20 +SYSOM_POLL_INTERVAL = 1 +SLEEP_INTERVAL = 1 + + +class DiagnoseWorker(Process): + + g_client: Optional[GClient] = None + + def __init__(self, metric_manager: MetricManager, queue, parent_pid): + super().__init__(daemon=True) + self.metric_manager = metric_manager + self.queue = queue + self.parent_pid = parent_pid; + + @classmethod + def get_gclient(cls): + if cls.g_client is None: + SysomFramework.init(YAML_CONFIG) + cls.g_client = SysomFramework.gclient("sysom_diagnosis") + return cls.g_client + + def check_if_parent_is_alive(self): + try: + os.kill(self.parent_pid, 0) + except OSError: + logger.info(f"Diagnose worker's parent {self.parent_pid} is exit") + exit(0) + + # use metric id to get metric instance + def _search_metric(self, diagnose_info: DiagnoseInfo) -> Metric: + try: + level = diagnose_info.level + metric_id = diagnose_info.metric_id + + metric_list = self.metric_manager.registed_metric[level] + for metric in metric_list: + if metric.settings.metric_id == metric_id: + return metric + except Exception as e: + logger.exception(e) + return None + return None + + # todo: use async function + def send_diagnosis(self, diagnose_info: DiagnoseInfo): + metric = self._search_metric(diagnose_info) + if metric is None: + logger.warning(f"Diagnose:Can not find metric" + f"instance of {diagnose_info.metric_id}") + return + + try: + diag_input = metric.construct_diagnose_req(diagnose_info) + retdiag = DiagnoseWorker.get_gclient() \ + .post("api/v1/tasks/", json=diag_input) + + retdiag_dict = retdiag.json() + if retdiag_dict["success"] is True: + taskid = retdiag_dict["data"]["task_id"] + logger.info(f"Send diagnosis success: {taskid}") + + start_time = datetime.now() + end_time = datetime.now() + + while True: + if (end_time - start_time).seconds >= SYSOM_POLL_TIMEOUT: + break + retdict_t = get_diagnose_result(taskid) + state = retdict_t["data"]["status"] + if state == "Success": + logger.info("Get diagnosis result success!") + metric.process_diagnose_req(diagnose_info, + retdict_t["data"]["result"] + ) + break + if state == "Fail": + logger.info("Get diagnosis result failed!") + break + time.sleep(SYSOM_POLL_INTERVAL) + end_time = datetime.now() + except Exception as e: + logger.error(f"Get diagnosis of alarm " + f"{diagnose_info.alarm_id} failed: {e}") + + def run(self): + logger.info(f'告警诊断下发守护进程PID: {os.getpid()}') + + while True: + try: + self.check_if_parent_is_alive() + + if self.queue.empty(): + time.sleep(SLEEP_INTERVAL) + continue + + diagnose_info = self.queue.get() + self.send_diagnosis(diagnose_info) + + except Exception as e: + logger.exception(e) + + +def get_diagnose_result(taskid): + retdict = {"data": {"status": "Failed"}} + try: + retdiag = DiagnoseWorker.get_gclient().get("api/v1/tasks/%s/" % taskid) + retdict = retdiag.json() + except BaseException: + logger.info("get_diagnose_result exception!") + pass + return retdict diff --git a/sysom_server/sysom_cluster_health/app/health_calculator/algorithm/health_algorithm.py b/sysom_server/sysom_cluster_health/app/health_calculator/algorithm/health_algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..99267119dc0f08ba37554b4f35cd18fb489d1820 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/algorithm/health_algorithm.py @@ -0,0 +1,292 @@ +import json +import conf.settings as settings +from abc import abstractmethod +from typing import List, Dict, Tuple +from clogger import logger +from lib.common_type import Level +from app.health_calculator.health_metric import HealthMetric, HealthMetricsMap + +METRIC_TYPES = ["capacity", "load", "latency", "error"] + +class HealthAlgorithm: + def __init__(self, level: Level): + self.level = level + self.registed_metric = {} # metric_id -> metric_setting map + self.type_metrics = { + metric_type: [] for metric_type in METRIC_TYPES + } # type -> [metric_setting] map + self.output_abnormal_metrics = { + metric_type: HealthMetricsMap() for metric_type in METRIC_TYPES + } + + @abstractmethod + def preprocessing(self, metrics: Dict[str, HealthMetric]): + """ + Preprocess the data + + args: + metrics: the metrics data receive from gcache metric set + metrics = { + "metric_id1": HealthMetric + "metric_id2": HealthMetric + ... + } + + """ + raise NotImplementedError("Subclass must implement abstract method") + + @abstractmethod + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + """ + The method to calculate the health of this level + + return: (capacity_score, load_score, latency_score, error_score, instance_score) + """ + raise NotImplementedError("Subclass must implement abstract method") + + + def calculate_lower_level(self, values: List[float]) -> float: + """ + The method to calculate the lower level health to this level + + args: + values: the health score of lower level instances + assume we are calculating a node's score, the values = + [pod1_score, pod2_score, pod3_score3] + + return: the health score of this instance(cluster, node, pod) + """ + if len(values) <= 0: + return 100 + + bad_instances = [] + for value in values: + if value < 0 or value > 100: + raise ValueError(f"Score: {value} is invalid") + if value < 60: + bad_instances.append(value) + + # no bad instances + if len(bad_instances) <= 0: + return sum(values) / len(values) + + if len(bad_instances) == len(values): + return 0 + + bad_instances_ratio = len(bad_instances) / len(values) + # if the ratio of bad instances is more than 10%, the health score will be under 60 + if bad_instances_ratio >= 0.1: + return 60 * (1 - (len(bad_instances) / len(values) - 0.1)) + if bad_instances_ratio < 0.1: + return 90 - 30 * (len(bad_instances) / (len(values) * 0.1)) + + return 100 + + + def register_metric_from_settings(self): + metrics_mapping = { + Level.Cluster: settings.CLUSTER_HEALTH_METRICS, + Level.Node: settings.NODE_HEALTH_METRICS, + Level.Pod: settings.POD_HEALTH_METRICS + } + + metrics = metrics_mapping.get(self.level) + if metrics is not None: + for metric in metrics: + if "MetricID" not in metric or "Type" not in metric: + logger.warning(f"Setting: metric {json.dumps(metric)}" + f"is invalid, skip it") + continue + + metric_id = metric["MetricID"] + metric_type = metric["Type"] + + if metric_type not in METRIC_TYPES: + logger.warning(f"Setting: metric {metric_id}" + f"type {metric_type} is invalid, skip it") + continue + + self.registed_metric[metric_id] = metric + self.type_metrics[metric_type].append(metric) + + + def get_abnormal_metrics(self) -> Dict[str, HealthMetricsMap]: + return self.output_abnormal_metrics + + +class DefaultHealthAlgorithm(HealthAlgorithm): + def __init__(self, level: Level): + super().__init__(level) + + def preprocessing(self, metrics: Dict[str, HealthMetric]): + categories = ["critical_abnormal", "suspect_abnormal", "trend_abnormal"] + types = METRIC_TYPES + ["instance"] + score_ranges = [(0, 60), (60, 90), (90, 100)] + + self.data = {type: {category: [] for category in categories} for type in types} + + def __add_metric_to_data(metric, category): + self.data[metric_type][category].append(metric) + self.data["instance"][category].append(metric) + self.output_abnormal_metrics[metric_type].add_metric(metric) + + for metric in metrics.values(): + if metric.metric_id not in self.registed_metric: + logger.warning(f"Receive metric {metric.metric_id} not in registed metrics") + continue + + metric_type = self.registed_metric[metric.metric_id]["Type"] + + score = metric.score + if score < 0 or score > 100: + logger.warning(f"Metric {metric.metric_id} score {score} is invalid") + continue + + for i, (low, high) in enumerate(score_ranges): + if low <= score < high: + __add_metric_to_data(metric, categories[i]) + break + + + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + def _calculate_score(abnormal_dict, metric_num) -> float: + if metric_num <= 0: + return 100 + + if len(abnormal_dict["critical_abnormal"]) > 0: + return 60 * (1 - len(abnormal_dict["critical_abnormal"]) / metric_num) + + if len(abnormal_dict["suspect_abnormal"]) > 0: + return 90 - 30 * (len(abnormal_dict["suspect_abnormal"]) / metric_num) + + if len(abnormal_dict["trend_abnormal"]) > 0: + return 100 - 10 * (len(abnormal_dict["trend_abnormal"]) / metric_num) + + return 100 + + res = [] + for type in METRIC_TYPES: + type_registed_metrics = self.type_metrics.get(type, {}) + res.append(_calculate_score(self.data[type], len(type_registed_metrics))) + + res.append(_calculate_score(self.data["instance"], len(self.registed_metric))) + return tuple(res) + + +class WeightedSumAlgorithm(HealthAlgorithm): + def __init__(self, level: Level): + super().__init__(level) + + + def register_metric_from_settings(self): + def __check_weight(metrics): + for type, metrics in metrics.items(): + if len(metrics) <= 0: + continue + + weight = 0 + for metric in metrics: + weight += metric["Weight"] + if weight != 1: + raise Exception(f"Sum of weight of {type} metrics of " + f"level {self.level} is not equal 1") + + super().register_metric_from_settings() + __check_weight(self.type_metrics) + + + def preprocessing(self, metrics: Dict[str, HealthMetric]): + self.data = { + metric_type: {} for metric_type in METRIC_TYPES + } + for metric in metrics.values(): + if metric.metric_id not in self.registed_metric: + logger.warning(f"Receive metric {metric.metric_id} not in registed metrics") + continue + + metric_type = self.registed_metric[metric.metric_id]["Type"] + self.data[metric_type][metric.metric_id] = metric + + + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + """ + Calculate the health score of this level using weighted sum algorithm: + type_score = sum(metric_score * metric_weight) + instance_score = avg(type_score) + """ + + res = [] + for type, type_metrics in self.type_metrics.items(): + if len(type_metrics) <= 0: + res.append(100) + continue + + type_score = 0 + if type not in self.data: + logger.warning("WeightedSumAlgorithm: Type {type} not in receive data") + type_score = 100 + res.append(type_score) + continue + + for metric in type_metrics: + metric_score = 100 + metric_id = metric["MetricID"] + if metric_id not in self.data[type]: + logger.warning(f"WeightedSumAlgorithm: Metric {metric} not in data") + else: + metric_score = self.data[type][metric_id].score + + if metric_score < 100: + self.output_abnormal_metrics[type].add_metric(self.data[type][metric_id]) + + type_score += metric_score * metric["Weight"] + + res.append(type_score) + + instance_score = sum(res) / len(res) + res.append(instance_score) + print("WeightedSumAlgorithm: calculated score: ", res) + return tuple(res) + + +class EwmAlgorithm(HealthAlgorithm): + def __init__(self, level: Level): + super().__init__(level) + + def preprocessing(self, metrics: Dict[str, HealthMetric]): + pass + + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + pass + +class CriticAlgorithm(HealthAlgorithm): + def __init__(self, level: Level): + super().__init__(level) + + def preprocessing(self, metrics: Dict[str, HealthMetric]): + pass + + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + pass + +def choose_algorithm(alg_setting: str, level: Level) -> HealthAlgorithm: + algorithm_mapping = { + "default": DefaultHealthAlgorithm, + "weightedSum": WeightedSumAlgorithm, + "ewm": EwmAlgorithm, + "critic": CriticAlgorithm + } + + algorithm_class = algorithm_mapping.get(alg_setting, DefaultHealthAlgorithm) + algorithm_instance = algorithm_class(level) + + try: + algorithm_instance.register_metric_from_settings() + except Exception as e: + logger.error(f"Algorithm {alg_setting} init failed: {e}") + raise e + + logger.info(f"选择算法: {algorithm_class.__name__},层级: {level}") + + return algorithm_instance + diff --git a/sysom_server/sysom_cluster_health/app/health_calculator/calculator.py b/sysom_server/sysom_cluster_health/app/health_calculator/calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..16ef14a828c92d99dba812c2d61e1dfc97130a93 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/calculator.py @@ -0,0 +1,129 @@ +import time +import copy +from typing import Dict, Optional +from conf.settings import * +from multiprocessing import Process +from schedule import Scheduler +from os import getpid, kill +from clogger import logger +from sysom_utils import SysomFramework, GCache +from lib.common_type import Level +from app.health_calculator.instance import construct_cluster_infos +from app.health_calculator.algorithm.health_algorithm import choose_algorithm, HealthAlgorithm +from app.crud import del_all_abnormal_metrics_data +from app.database import SessionLocal +from metric_reader import dispatch_metric_reader + + +class HealthCalculator(Process): + gcache: Optional[Dict[Level, GCache]] = None + algorithms: Optional[Dict[Level, HealthAlgorithm]] = None + + def __init__( + self, + parent_pid: int = None + ) -> None: + super().__init__(daemon=True) + self.clusterhealth_interval = CALCULATE_INTERVAL + self.clusterhealth_host_schedule: Scheduler = Scheduler() + self.parent_pid = parent_pid + self.cluster_infos = {} + self.metric_reader = dispatch_metric_reader( + "prometheus://" + PROMETHEUS_CONFIG.host + ":" + str(PROMETHEUS_CONFIG.port)) + + @classmethod + def get_gcache(cls) -> Dict[Level, GCache]: + if cls.gcache is None: + cls.gcache = { + Level.Cluster: SysomFramework.gcache(CLUSTER_HEALTH_METRIC_GCACHE), + Level.Node: SysomFramework.gcache(NODE_HEALTH_METRIC_GCACHE), + Level.Pod: SysomFramework.gcache(POD_HEALTH_METRIC_GCACHE) + } + return cls.gcache + + @classmethod + def get_algorithms(cls) -> Dict[Level, HealthAlgorithm]: + if cls.algorithms is None: + cls.algorithms = { + Level.Cluster: choose_algorithm(CLUSTER_ALGORITHM, Level.Cluster), + Level.Node: choose_algorithm(NODE_ALGORITHM, Level.Node), + Level.Pod: choose_algorithm(POD_ALGORITHM, Level.Pod) + } + return cls.algorithms + + def _check_if_parent_is_alive(self): + try: + kill(self.parent_pid, 0) + except OSError: + logger.info(f"Analyzer's parent {self.parent_pid} is exit") + exit(0) + + def _calculate_health(self): + for cluster in self.cluster_infos.values(): + cluster.collect_metrics( + HealthCalculator().get_gcache()[Level.Cluster] + ) + for node in cluster.nodes.values(): + node.collect_metrics( + HealthCalculator().get_gcache()[Level.Node] + ) + for pod in node.pods.values(): + # collect metrics from gcache + pod.collect_metrics( + HealthCalculator().get_gcache()[Level.Pod] + ) + # calculate pod health score + pod.calculate_health(copy.deepcopy( + HealthCalculator().get_algorithms()[Level.Pod] + )) + # calculate node health score + node.calculate_health(copy.deepcopy( + HealthCalculator().get_algorithms()[Level.Node] + )) + # calculate cluster health score + cluster.calculate_health(copy.deepcopy( + HealthCalculator().get_algorithms()[Level.Cluster] + )) + + def calculating_task(self): + start_time = time.time() + + # cleanup abnormal metrics of last round data from mysql + if ABNORMAL_METRIC_STORAGE == "mysql": + with SessionLocal() as db: + del_all_abnormal_metrics_data(db) + + try: + self.cluster_infos = construct_cluster_infos(self.metric_reader, + self.clusterhealth_interval) + except Exception as e: + logger.error(f"Failed to construct cluster infos: {e}") + return + + self._calculate_health() + + # cleanup metric set from gcache + HealthCalculator().get_gcache()[Level.Cluster].clean() + HealthCalculator().get_gcache()[Level.Node].clean() + HealthCalculator().get_gcache()[Level.Pod].clean() + + self.last_end_time = time.time() + end_time = time.time() + logger.info(f"Excutaion time: {end_time - start_time}") + + + def run(self) -> None: + logger.info(f'健康度计算守护进程PID: {getpid()}') + + self.calculating_task() + self.clusterhealth_host_schedule.every(self.clusterhealth_interval)\ + .seconds.do(self.calculating_task) + + while True: + self._check_if_parent_is_alive(); + + if self.is_alive(): + self.clusterhealth_host_schedule.run_pending() + else: + break + time.sleep(max(1, int(self.clusterhealth_interval / 2))) \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/health_calculator/health_metric.py b/sysom_server/sysom_cluster_health/app/health_calculator/health_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..72cefe343f3b015c22bcedcee200391e5cdfae96 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/health_metric.py @@ -0,0 +1,31 @@ +from pydantic import BaseModel +from collections import OrderedDict +from typing import Optional, Dict, List + +class HealthMetric(BaseModel): + metric_id: str + process_time: float + event_time: float + score: float + value: float + layer: str + cluster: str + node: Optional[str] + pod: Optional[str] + namespace: Optional[str] + +class HealthMetricsMap: + def __init__(self, capacity: int = 5): + self.capacity = capacity + self.metrics_map = OrderedDict() + + def add_metric(self, metric: HealthMetric): + if metric.metric_id in self.metrics_map: + self.metrics_map[metric.metric_id] = metric + else: + if len(self.metrics_map) >= self.capacity: + self.metrics_map.popitem(last=False) + self.metrics_map[metric.metric_id] = metric + self.metrics_map = OrderedDict( + sorted(self.metrics_map.items(), key=lambda item: item[1].score, reverse=True) + ) \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/health_calculator/instance.py b/sysom_server/sysom_cluster_health/app/health_calculator/instance.py new file mode 100644 index 0000000000000000000000000000000000000000..3c49172cf7af4c209c823f344852eb6f1118a5b9 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/instance.py @@ -0,0 +1,305 @@ +import json +import time +from conf.settings import * +from typing import Dict, List +from datetime import datetime +from clogger import logger +from sysom_utils import SysomFramework +from lib.utils import collect_all_clusters, \ + collect_instances_of_cluster, collect_pods_of_instance +from app.health_calculator.health_metric import HealthMetric, HealthMetricsMap +from app.health_calculator.score_result import ScoreResult, ScoreType +from app.schemas import AbnormalMetricsBase +from app.crud import create_abnormal_metrics_data +from app.database import SessionLocal +from abc import ABC, abstractmethod + +gcache_cluster_exporter = SysomFramework.gcache(CLUSTER_METRIC_EXPORTER) +gcache_node_exporter = SysomFramework.gcache(NODE_METRIC_EXPORTER) +gcache_pod_exporter = SysomFramework.gcache(POD_METRIC_EXPORTER) + +class Instance(ABC): + def __init__(self, name: str): + self.name = name + self.metrics = {} # type: Dict[str, HealthMetric] + self.type_score = {"capacity": 100, "load": 100, "latency": 100, "error": 100} + self.score = 100 + + + def _push_score_result(self): + """ + Insert the health score result to gcache + + args: + result: the health score result + """ + raise NotImplementedError("Must implement _push_score_result") + + def _insert_score_result( + self, + level_labels: Dict[str, str], + abnormal_metrics: Dict[str, HealthMetricsMap] + ) -> List[dict]: + score_result = [] + for type, metrics_map in abnormal_metrics.items(): + # store abnormal metrics to as prometheus metrics + if ABNORMAL_METRIC_STORAGE == "prometheus": + for abnormal_metric in metrics_map.metrics_map.values(): + labels = level_labels.copy() + labels["description"] = abnormal_metric.metric_id + labels["type"] = type + score_result.append( + ScoreResult( + labels, abnormal_metric.score, + abnormal_metric.value, ScoreType.MetricScore + ).to_dict() + ) + # store abnormal metrics to as mysql data + elif ABNORMAL_METRIC_STORAGE == "mysql": + with SessionLocal() as db: + for metric_id, abnormal_metric in metrics_map.metrics_map.items(): + abnormal_metric_data = AbnormalMetricsBase( + metric_id=metric_id, + metric_type=type, + cluster=level_labels["cluster"], + instance=level_labels.get("instance", ""), + namespace=level_labels.get("namespace", ""), + pod=level_labels.get("pod", ""), + score=abnormal_metric.score, + value=abnormal_metric.value, + timestamp=time.time() + ) + create_abnormal_metrics_data(db, abnormal_metric_data) + + for type, score in self.type_score.items(): + labels = level_labels.copy() + labels["type"] = type + score_result.append( + ScoreResult( + labels, score, 0, ScoreType.MetricTypeScore + ).to_dict() + ) + + score_result.append( + ScoreResult( + level_labels, self.score, 0, ScoreType.InstanceScore + ).to_dict() + ) + + return score_result + + def _validate_metric_time(self, metric: HealthMetric): + now = time.time() + if (now - metric.event_time) > 2 * CALCULATE_INTERVAL: + event_time = datetime.fromtimestamp(metric.event_time) + now_datetime = datetime.fromtimestamp(now) + logger.warning(f"Metric {metric.metric_id} is too old, " + f"event_time: {event_time} now: {now_datetime}") + return False + + return True + + def _add_metric(self, metric: HealthMetric): + if not self._validate_metric_time(metric): + return + + metric_id = metric.metric_id + # multiple metrics with the same metric_id, use the worst one + if metric_id in self.metrics: + if metric.score >= self.metrics[metric_id].score: + return + + self.metrics[metric.metric_id] = metric + + def _collect_metrics_from_gcacge(self, key, gcache): + metrics_list = gcache.get_list(key) + for metric_data in metrics_list: + health_metric = HealthMetric(**metric_data) + self._add_metric(health_metric) + + def _lower_level_instances_score(self) -> List[float]: + return [] + + def calculate_health(self, algorithm): + try: + algorithm.preprocessing(self.metrics) + + ( + self.type_score["capacity"], + self.type_score["load"], + self.type_score["latency"], + self.type_score["error"], + this_level_score + ) = algorithm.calculate_this_level() + + lower_instances_score = self._lower_level_instances_score() + lower_level_score = algorithm.calculate_lower_level( + lower_instances_score + ) + + abnormal_metrics = algorithm.get_abnormal_metrics() + except Exception as e: + logger.error(f"Calculate {self.name} health failed: {e}") + # set score to -1 to indicate the health score is invalid + self.score = -1 + return + + self.score = min(lower_level_score, this_level_score) + self._push_score_result(abnormal_metrics) + + @abstractmethod + def collect_metrics(self, gcache): + pass + + +# Container Level is not implemented yet +class Container(Instance): + def __init__(self, name: str, pod: Instance): + self.pod = pod + super().__init__(name) + +class Pod(Instance): + def __init__(self, name: str, namespace: str, node: Instance): + self.node = node + self.namespace = namespace + self.containers = {} + super().__init__(name) + + def _push_score_result(self, abnormal_metrics: Dict[str, HealthMetricsMap]): + level_labels = { + "cluster": self.node.cluster.name, + "instance": self.node.name, + "pod": self.name, + "namespace": self.namespace, + } + + score_result = self._insert_score_result(level_labels, abnormal_metrics) + gcache_pod_exporter.store(self.name, json.dumps(score_result)) + + def collect_metrics(self, gcache): + key = self.node.cluster.name + ":" + self.name + ":" + self.namespace + self._collect_metrics_from_gcacge(key, gcache) + + def _lower_level_instances_score(self) -> List[float]: + return super()._lower_level_instances_score() + + def add_container(self, container: Container): + self.containers[container.name] = container + + +class Node(Instance): + def __init__(self, name: str, cluster: Instance): + self.pods = {} + self.cluster = cluster + super().__init__(name) + + def add_pod(self, pod: Pod): + self.pods[pod.name] = pod + + def find_pod(self, pod_name: str) -> Pod: + return self.pods[pod_name] + + def _lower_level_instances_score(self) -> List[float]: + return [ + pod.score + for pod in self.pods.values() + if 0 <= pod.score <= 100 + ] + + def _push_score_result(self, abnormal_metrics: Dict[str, HealthMetricsMap]): + level_labels = { + "cluster": self.cluster.name, + "instance": self.name, + } + + score_result = self._insert_score_result(level_labels, abnormal_metrics) + gcache_node_exporter.store(self.name, json.dumps(score_result)) + + def collect_metrics(self, gcache): + key = self.cluster.name + ":" + self.name + self._collect_metrics_from_gcacge(key, gcache) + + +class Cluster(Instance): + def __init__(self, name: str): + self.nodes = {} + super().__init__(name) + + def add_node(self, node: Node): + self.nodes[node.name] = node + + def find_node(self, node_name: str) -> Node: + return self.nodes[node_name] + + def _lower_level_instances_score(self) -> List[float]: + return [ + node.score + for node in self.nodes.values() + if 0 <= node.score <= 100 + ] + + def _lower_level_type_score(self, type: str) -> List[float]: + return [ + node.type_score[type] + for node in self.nodes.values() + if 0 <= node.type_score[type] <= 100 + ] + + def _push_score_result(self, abnormal_metrics: Dict[str, HealthMetricsMap]): + level_labels = { + "cluster": self.name, + } + + score_result = self._insert_score_result(level_labels, abnormal_metrics) + gcache_cluster_exporter.store(self.name, json.dumps(score_result)) + + def collect_metrics(self, gcache): + key = self.name + self._collect_metrics_from_gcacge(key, gcache) + + def calculate_health(self, algorithm): + try: + for type in self.type_score.keys(): + self.type_score[type] = algorithm.calculate_lower_level( + self._lower_level_type_score(type) + ) + + self.score = algorithm.calculate_lower_level( + self._lower_level_instances_score() + ) + except Exception as e: + logger.error(f"Calculate {self.name} health failed: {e}") + # set score to -1 to indicate the health score is invalid + self.score = -1 + return + + self._push_score_result({}) + + +def construct_cluster_infos(metric_reader, interval) -> Dict[str, Cluster]: + """ + Construct cluster infos from prometheus metrics + """ + res = {} + clusters = collect_all_clusters(metric_reader) + if len(clusters) == 0 or NO_CLUSTER_LABEL is True: + clusters.append("default") + + for cluster in clusters: + cluster_instance = Cluster(cluster) + nodes = collect_instances_of_cluster(cluster, + metric_reader, interval) + for node in nodes: + node_instance = Node(node, cluster_instance) + pods = collect_pods_of_instance(node, + metric_reader, interval) + for pod, ns in pods: + pod_instance = Pod(pod, ns, node_instance) + node_instance.add_pod(pod_instance) + cluster_instance.add_node(node_instance) + + res[cluster] = cluster_instance + + return res + + \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/health_calculator/score_result.py b/sysom_server/sysom_cluster_health/app/health_calculator/score_result.py new file mode 100644 index 0000000000000000000000000000000000000000..13a891c5d260dc939299cd3b64f85abd35753868 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/score_result.py @@ -0,0 +1,43 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Dict, NewType, List + +class ScoreType(Enum): + MetricScore = 1 + InstanceScore = 2 + MetricTypeScore = 3 + + +class ScoreResult: + def __init__(self, labels: Dict[str, str], score: float, + value: float, type: ScoreType): + self.labels = labels + self.score = score + self.value = value + self.type = type + + def keys(self): + return ('labels', 'score', 'value', 'type') + + def __getitem__(self, item): + return getattr(self, item) + + def to_dict(self): + result_dict = {} + for key in self.keys(): + value = self[key] + if isinstance(value, ScoreType): + result_dict[key] = value.value + else: + result_dict[key] = value + return result_dict + + +# [metric_type: [metrics'score result]] +TypeResult = NewType('TypeResult', Dict[str, List[ScoreResult]]) + + +@dataclass +class LevelResults: + labels: Dict[str, str] + results: TypeResult \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/models.py b/sysom_server/sysom_cluster_health/app/models.py new file mode 100644 index 0000000000000000000000000000000000000000..04160af0bdd7a62b738821bd21579f08bdfd0d4d --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/models.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File models.py +Description: +""" +from sqlalchemy import Column, Integer, String, DateTime, Float +from sqlalchemy.sql import func +from app.database import Base + + +########################################################################### +# Define databse model here +########################################################################### + +class BaseModel: + uuid = Column(String(128), primary_key=True, unique=True) + metric_id = Column(String(256)) + metric_type = Column(String(128)) + score = Column(Float) + value = Column(Float) + timestamp = Column(Float, default=func.time()) + +class AbnormalMetricsCluster(Base, BaseModel): + __tablename__ = "sys_abnormal_metrics_cluster" + + cluster = Column(String(256)) + +class AbnormalMetricsNode(Base, BaseModel): + __tablename__ = "sys_abnormal_metrics_node" + + cluster = Column(String(256)) + instance = Column(String(256)) + +class AbnormalMetricsPod(Base, BaseModel): + __tablename__ = "sys_abnormal_metrics_pod" + + cluster = Column(String(256)) + instance = Column(String(256)) + pod = Column(String(256)) + namespace = Column(String(256)) + diff --git a/sysom_server/sysom_cluster_health/app/query.py b/sysom_server/sysom_cluster_health/app/query.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbcd0898fc8d3f2be8d64ce694d7aa99ccb2332 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/query.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/09/19 15:41 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File schemas.py +Description: +""" +from typing import Optional +from app import models +from sysom_utils import BaseQueryParams + + +# class PersonQueryParams(BaseQueryParams): + +# # 1. 指定要查询的模型 +# __modelclass__ = models.Person + +# # 2. 定义排序字段 +# sort: str = "-created_at" + +# # 3. 定义支持用于过滤的参数 +# name: Optional[str] = None +# age: Optional[str] = None + +# # 4. 指定哪些字段是枚举类型,并且指明对应的枚举类 +# __enum_fields__ = { +# } \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/routers/health.py b/sysom_server/sysom_cluster_health/app/routers/health.py new file mode 100644 index 0000000000000000000000000000000000000000..b6c41cae924ca6b63eedeaaafbe5cea84ea91424 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/routers/health.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File health.py +Description: +""" +from fastapi import APIRouter + + +router = APIRouter() + + +@router.get("/check") +async def get_channel_config(): + return { + "code": 0, + "err_msg": "", + "data": "" + } diff --git a/sysom_server/sysom_cluster_health/app/routers/metrics.py b/sysom_server/sysom_cluster_health/app/routers/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..fe28918989ee27e5c9291299aa56116f7e78b7ca --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/routers/metrics.py @@ -0,0 +1,122 @@ +import json +from conf.settings import * +from prometheus_client import Gauge, CollectorRegistry, generate_latest +from fastapi import APIRouter, HTTPException, Response +from app.health_calculator.score_result import ScoreType +from sysom_utils import SysomFramework +from clogger import logger + +CLUSTER_HEALTH_SCORE_LABEL = ["cluster", "type"] +CLUSTER_HEALTH_METRIC_LABEL = ["cluster", "type", "description", "mode"] +NODE_HEALTH_SCORE_LABEL = ["cluster", "instance", "type"] +NODE_HEALTH_METRIC_LABEL = [ + "cluster", + "instance", + "type", + "description", + "mode"] +POD_HEALTH_SCORE_LABEL = ["cluster", "instance", "pod", "namespace", "type"] +POD_HEALTH_METRIC_LABEL = [ + "cluster", + "instance", + "type", + "pod", + "namespace", + "description", + "mode"] + + +registry = CollectorRegistry() +cluster_health_score = Gauge('sysom_cluster_health_score', + 'sysom cluster health score', + CLUSTER_HEALTH_SCORE_LABEL, + registry=registry) +cluster_health_metric = Gauge('sysom_cluster_health_metric', + 'sysom cluster health metric', + CLUSTER_HEALTH_METRIC_LABEL, + registry=registry) +node_health_score = Gauge('sysom_node_health_score', + 'sysom node health score', + NODE_HEALTH_SCORE_LABEL, + registry=registry) +node_health_metric = Gauge('sysom_node_health_metric', + 'sysom node health metric', + NODE_HEALTH_METRIC_LABEL, + registry=registry) +pod_health_score = Gauge('sysom_pod_health_score', + 'sysom pod health score', + POD_HEALTH_SCORE_LABEL, + registry=registry) +pod_health_metric = Gauge('sysom_pod_health_metric', + 'sysom pod health score', + POD_HEALTH_METRIC_LABEL, + registry=registry) + + +router = APIRouter() + +@router.get("/metrics") +def get_metrics(): + # pull health score metric from redis and push to prometheus + g_cache_cluster = SysomFramework.gcache(CLUSTER_METRIC_EXPORTER) + g_cache_instance = SysomFramework.gcache(NODE_METRIC_EXPORTER) + g_cache_pod = SysomFramework.gcache(POD_METRIC_EXPORTER) + + try: + cluster_all = g_cache_cluster.load_all() + nodes_all = g_cache_instance.load_all() + pods_all = g_cache_pod.load_all() + + if len(cluster_all) <= 0 or len(nodes_all) <= 0: + return Response(generate_latest(registry), media_type="text/plain") + + def process_metrics(metrics_all, score_labels, metric_labels, + health_score, health_metric, cache): + for item, results in metrics_all.items(): + metrics = json.loads(results) + # the last element is the health score + for metric in metrics: + if metric["type"] == ScoreType.MetricScore.value: + labels = [metric["labels"][label] + for label in metric_labels[:-1]] + ["score"] + # return score of each metric + health_metric.labels(*labels).set(metric["score"]) + # return value of each metric + labels[-1] = "value" + health_metric.labels(*labels).set(metric["value"]) + elif metric["type"] == ScoreType.MetricTypeScore.value: + labels = [metric["labels"][label] + for label in score_labels] + # return score of each metric + health_score.labels(*labels).set(metric["score"]) + elif metric["type"] == ScoreType.InstanceScore.value: + labels = [metric["labels"][label] + for label in score_labels[:-1]] + ["total"] + # return score of each metric + health_score.labels(*labels).set(metric["score"]) + + # delete metrics from redis + cache.delete(item) + + process_metrics(cluster_all, CLUSTER_HEALTH_SCORE_LABEL, + CLUSTER_HEALTH_METRIC_LABEL, + cluster_health_score, + cluster_health_metric, g_cache_cluster) + process_metrics(nodes_all, NODE_HEALTH_SCORE_LABEL, + NODE_HEALTH_METRIC_LABEL, + node_health_score, + node_health_metric, g_cache_instance) + process_metrics(pods_all, POD_HEALTH_SCORE_LABEL, + POD_HEALTH_METRIC_LABEL, + pod_health_score, + pod_health_metric, g_cache_pod) + + except Exception as e: + logger.error("Exception: ", e) + raise HTTPException(status_code=400, detail=str(e)) + finally: + g_cache_cluster.clean() + g_cache_instance.clean() + g_cache_pod.clean() + + return Response(generate_latest(registry), media_type="text/plain") \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/routers/person.py b/sysom_server/sysom_cluster_health/app/routers/person.py new file mode 100644 index 0000000000000000000000000000000000000000..c8fd0085f490176eca37684bb18c48e462dc6c71 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/routers/person.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File health.py +Description: +""" +from fastapi import APIRouter, Depends +from sqlalchemy.orm import Session +from app.query import PersonQueryParams +from app.database import get_db +from app.crud import get_person_list, get_person_by_name +from app.schemas import Person +from sysom_utils import StandardListResponse, StandardResponse + + +router = APIRouter() + + +@router.get("/get") +async def get_specific_person( + person_name: str, db: Session = Depends(get_db) +): + person = get_person_by_name(db, person_name) + return StandardResponse(person, Person) + +@router.get("/list") +async def get_persons( + query_params: PersonQueryParams = Depends(), db: Session = Depends(get_db) +): + person_list = get_person_list(db, query_params) + return StandardListResponse(person_list, Person) diff --git a/sysom_server/sysom_cluster_health/app/schemas.py b/sysom_server/sysom_cluster_health/app/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..86949263d20b3dd195aff133d1485d4daf3b2156 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/schemas.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File schemas.py +Description: +""" +from pydantic import BaseModel +from typing import Optional + +########################################################################### +# Define schemas here +########################################################################### + +# @reference https://fastapi.tiangolo.com/zh/tutorial/response-model/ +# class Person(BaseModel): +# id: int +# name: str +# age: int +# created_at: datetime + +# class Config: +# orm_mode = True + + +class AbnormalMetricsBase(BaseModel): + metric_id: str + metric_type: str + cluster: str + instance: Optional[str] + namespace: Optional[str] + pod: Optional[str] + score: float + value: float + timestamp: float \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/clusterhealth_settings.py b/sysom_server/sysom_cluster_health/conf/clusterhealth_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..60a4e73e4db57cd39ac16b7dfe04d48906b300d5 --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/clusterhealth_settings.py @@ -0,0 +1,148 @@ +CALCULATE_INTERVAL = 30 + +# where to store the abnormal metrics list +# mysql: store in mysql database +# prometheus: expose as prometheus metrics +ABNORMAL_METRIC_STORAGE = "mysql" + +# Health Algorithm for calculating the specific level +# default: default algorithm +# weightedSum: weighted sum algorithm +CLUSTER_ALGORITHM = "default" +NODE_ALGORITHM = "default" +POD_ALGORITHM = "default" + + +################################################################################# +# Cluster Health Metrics +################################################################################# + +CLUSTER_HEALTH_METRICS = [] + +################################################################################# +# Node Health Metrics +################################################################################# + +NODE_HEALTH_METRICS = [ + ########################################## + # Node Saturation Metrics + ######################################### + { + "MetricID": "Node file descriptor util", + "Type": "capacity", + "Weight": 0.2, + }, + { + "MetricID": "Node memory util", + "Type": "capacity", + "Weight": 0.1, + }, + { + "MetricID": "Node cpu util", + "Type": "capacity", + "Weight": 0.1, + }, + { + "MetricID": "Node sys util", + "Type": "capacity", + "Weight": 0.3, + }, + { + "MetricID": "Node rootfs util", + "Type": "capacity", + "Weight": 0.1, + }, + { + "MetricID": "Node rootfs inode util", + "Type": "capacity", + "Weight": 0.2, + }, + ######################################### + # Node load Metrics + ######################################### + { + "MetricID": "Node load average", + "Type": "load", + "Weight": 1.0, + }, + ######################################### + # Node latency Metrics + ######################################### + { + "MetricID": "Node sched latency", + "Type": "latency", + "Weight": 1.0, + }, + ######################################### + # Node Error Metrics + ######################################### + { + "MetricID": "Node OOM count", + "Type": "error", + "Weight": 1.0, + }, +] + +################################################################################# +# Pod Health Metrics +################################################################################# + +POD_HEALTH_METRICS = [ + ######################################### + # Pod Capacity Metrics + ######################################### + { + "MetricID": "Pod memory util", + "Type": "capacity", + "Weight": 0.3, + }, + { + "MetricID": "Pod cpu util", + "Type": "capacity", + "Weight": 0.2, + }, + { + "MetricID": "Pod sys util", + "Type": "capacity", + "Weight": 0.5, + }, + #{ + # "MetricID": "Pod rootfs util", + # "Type": "capacity", + # "Weight": 0.1, + #}, + #{ + # "MetricID": "Pod rootfs inode util", + # "Type": "capacity", + # "Weight": 0.1, + #}, + ######################################### + # Pod Load Metrics + ######################################### + { + "MetricID": "Pod load average", + "Type": "load", + "Weight": 1.0, + }, + ######################################### + # Pod Latency Metrics + ######################################### + { + "MetricID": "Pod memory reclaim latency", + "Type": "latency", + "Weight": 1.0, + }, + ######################################### + # Pod Error Metrics + ######################################### + { + "MetricID": "Pod OOM count", + "Type": "error", + "Weight": 0.5, + }, + { + "MetricID": "Pod memory fail count", + "Type": "error", + "Weight": 0.5, + }, +] \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/collector_settings.py b/sysom_server/sysom_cluster_health/conf/collector_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..853541606f31194f6e9bf583a3f250caf3e9df9f --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/collector_settings.py @@ -0,0 +1,343 @@ +from .common import * + +################################################################################# +# Base Settings +################################################################################# + +# Interval to collect metric from prometheus +COLLECT_INTERVAL = 30 +# If True, use multi-thread to collect and calculate +ENABLE_MULTI_THREAD = False +# Number or thread to use if ENABLE_MULTI_THREAD enable +ANALYZER_PROCESS_NUM = 1 + +################################################################################# +# Global Metric Collecting Settings +################################################################################# + +# the following settings is to specify some label name, in case metric labels changing +CLUSTER_LABEL = "cluster" +POD_LABEL = "pod" +NODE_LABEL = "instance" +NAMESPACE_LABEL = "namespace" +POD_METRIC_TAG = "value" + + +################################################################################# +# Global Alarm and Diagnose Settings +################################################################################# + +# The size of queue which used to send diagnose request between analyzer and diagnose worker +MAX_QUEUE_SIZE = 500 +# Used to merge alarms +ALARM_MERGE_NUM = 10 + +################################################################################# +# Cluster Metrics Settings +################################################################################# + +CLUSTER_METRICS = [ +] + +################################################################################# +# Pod Metrics Settings +################################################################################# + +POD_METRICS = [ + { + "MetricID": "Pod memory util", # description of the metric + "Type": "CapacityMetric", + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_memUtil", # table name + "related_value": ["usage", "limit"], # specific metric + "standard_type": 2, # 0 = non-standard(custom), 1 = already usage, 2 = (usage/total*100) + }, + "Score": { # settings for calculating metric score + "100": 70, # mem usage >= 70% -- 100分(good) + "70": 80, # mem usage >= 80% -- 70分(warning) + "60": 90, # mem usage >= 90% -- 60分(error) + "0": 100 # mem usage >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Pod cpu util", # description of the metric + "Type": "CapacityMetric", + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_cpuacct_stat", + "related_value": ["total"], + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 70, # cpu total util >= 70% -- 100分(good) + "70": 80, # cpu total util >= 85% -- 70分(warning) + "60": 90, # cpu total util >= 90% -- 60分(error) + "0": 100 # cpu total util >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Pod sys util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_cpuacct_stat", + "related_value": ["system"], + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 5, + "70": 10, + "60": 20, + "0": 30, + } + }, + { + "MetricID": "Pod load average", # description of the metric + "Type": "LoadMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_proc_stat", + "related_value": ["r_load1min"], + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 0.7, + "70": 1, + "60": 5, + "0": 10 + } + }, + { + "MetricID": "Pod memory reclaim latency", # description of the metric + "Type": "LatencyMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_memdrcm_latency", + # 延时区间需要按从小到达填写 + "related_value": ["memDrcm_lat_10to100ms", "memDrcm_lat_100to500ms", "memDrcm_lat_500to1000ms", + "memDrcm_lat_1000ms"], + "standard_type": 2, + }, + "Score": { # settings for calculating metric score + "100": 0, + "70": 100, + "60": 10000, + "0": 100000 + } + }, + #{ + # "MetricID": "Pod memory compact latency", # description of the metric + # "Type": "LatencyMetric", # metric type + # "Collect": { # settings for collecting and preprocessing metric + # "metric_name": "sysom_container_memmcmp_latency", + # 延时区间需要按从小到达填写 + # "related_value": ["memDcmp_lat_10to100ms", "memDcmp_lat_100to500ms", "memDcmp_lat_500to1000ms", + # "memDcmp_lat_1000ms"], + # "standard_type": 2, + # }, + # "Score": { # settings for calculating metric score + # "100": 0, + # "70": 100, + # "60": 10000, + # } + #}, + { + "MetricID": "Pod OOM count", # description of the metric + "Type": "ErrorMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_memory_oomcnt", + "related_value": ["oom_kill"], + "standard_type": 1, + }, + "Score": { + "100": 0, + "60": 1, + "0": 5, + } + }, + { + "MetricID": "Pod memory fail count", # description of the metric + "Type": "ErrorMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_memfail_cnt", + "related_value": ["fail_cnt"], + "standard_type": 1, + }, + "Score": { + "100": 0, + "80": 10, + "60": 50, + "0": 100, + } + }, + #{ + # "MetricID": "Pod cpu throttled count", # description of the metric + # "Type": "ErrorMetric", # metric type + # "Collect": { # settings for collecting and preprocessing metric + # "metric_name": "sysom_container_cpu_stat", + # "related_value": ["nr_throttled"], + # "standard_type": 1, + # }, + # "Score": { + # "100": 0, + # "60": 1, + # "0": 5, + # } + #} +] + +################################################################################# +# Nodes Metrics Settings +################################################################################# + +NODE_METRICS = [ + { + "MetricID": "Node file descriptor util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_file_descriptor", + "related_value": ["file-nr", "file-max"], + "node_tag_name": "type", + "standard_type": 0, + "filename": "node_fd_util", + }, + "Score": { # settings for calculating metric score + "100": 40, # fd util >= 50% -- 100分(good) + "60": 60, # fd util >= 85% -- 70分(warning) + "30": 80, # cpu total util >= 90% -- 60分(error) + "0": 100 # cpu total util >= 95% -- 0分(fatel) + }, + "Alarm": { # settings for alerting and diagnosing + "threshold": 30, + "diagnose_type": "custom", + "service_name": "command" + } + }, + { + "MetricID": "Node memory util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_meminfo", # table name + "related_value": ["MemAvailable", "MemTotal"], # specific metric + "node_tag_name": "value", + "standard_type": 3, # 0 = non-standard, 1 = already usage, 2 = (usage/total*100) + }, + "Score": { # settings for calculating metric score + "100": 70, # mem usage >= 70% -- 100分(good) + "70": 80, # mem usage >= 80% -- 70分(warning) + "60": 90, # mem usage >= 90% -- 60分(error) + "0": 100 # mem usage >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Node cpu util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_cpu_total", + "related_value": ["idle"], + "node_tag_name": "mode", + "standard_type": 0, + "filename": "node_cpu_util", + }, + "Score": { # settings for calculating metric score + "100": 70, # cpu total util >= 70% -- 100分(good) + "70": 80, # cpu total util >= 85% -- 70分(warning) + "60": 90, # cpu total util >= 90% -- 60分(error) + "0": 100 # cpu total util >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Node sys util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_cpu_total", + "related_value": ["sys"], + "node_tag_name": "mode", + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 5, + "70": 10, + "60": 20, + "0": 30 + } + }, + { + "MetricID": "Node rootfs util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_fs_stat", + "related_value": ["f_bavail", "f_blocks", "f_bfree"], + "node_tag_name": "counter", + "standard_type": 0, + "filename": "node_rootfs_util", + }, + "Score": { # settings for calculating metric score + "100": 50, + "70": 70, + "60": 90, + "0": 95 + } + }, + { + "MetricID": "Node rootfs inode util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_fs_stat", + "related_value": ["f_favail", "f_files"], + "node_tag_name": "counter", + "standard_type": 0, + "filename": "node_rootfs_inode_util", + }, + "Score": { # settings for calculating metric score + "100": 50, + "70": 70, + "60": 90, + "0": 95 + } + }, + { + "MetricID": "Node load average", # description of the metric + "Type": "LoadMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_loadavg", + "related_value": ["load1"], + "node_tag_name": "value", + "standard_type": 0, + "filename": "node_load_avg", + }, + "Score": { # settings for calculating metric score + "100": 1, # cpu total util >= 70% -- 100分(good) + "70": 5, # cpu total util >= 85% -- 70分(warning) + "60": 10, # cpu total util >= 90% -- 60分(error) + "0": 20 # cpu total util >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Node sched latency", # description of the metric + "Type": "LatencyMetric", # metric type + "Collect": { + "metric_name": "sysom_cpu_dist", + "related_value": ["ms10","ms100","s1"], + "node_tag_name": "value", + "standard_type": 2, + }, + "Score": { + "100": 40, # cpu total util >= 70% -- 100分(good) + "70": 100, # cpu total util >= 85% -- 70分(warning) + "30": 150, # cpu total util >= 90% -- 60分(error) + "0": 200 # cpu total util >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Node OOM count", # description of the metric + "Type": "ErrorMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_vmstat", + "related_value": ["oom_kill"], + "node_tag_name": "value", + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 0, + "60": 1, + "0": 5, + } + } +] \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/common.py b/sysom_server/sysom_cluster_health/conf/common.py new file mode 100644 index 0000000000000000000000000000000000000000..4b280c6496d96e783d2f3b30bc27de0a584981c0 --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/common.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File common.py +Description: +""" +from pathlib import Path +from sysom_utils import ConfigParser, SysomFramework + +BASE_DIR = Path(__file__).resolve().parent.parent + +################################################################## +# Load yaml config first +################################################################## +YAML_GLOBAL_CONFIG_PATH = f"{BASE_DIR.parent.parent}/conf/config.yml" +YAML_SERVICE_CONFIG_PATH = f"{BASE_DIR}/config.yml" + +YAML_CONFIG = ConfigParser(YAML_GLOBAL_CONFIG_PATH, YAML_SERVICE_CONFIG_PATH) + +mysql_config = YAML_CONFIG.get_server_config().db.mysql +service_config = YAML_CONFIG.get_service_config() + +SysomFramework.init(YAML_CONFIG) + +################################################################## +# fastapi config +################################################################## +SQLALCHEMY_DATABASE_URL = ( + f"{mysql_config.dialect}+{mysql_config.engine}://{mysql_config.user}:{mysql_config.password}@" + f"{mysql_config.host}:{mysql_config.port}/{mysql_config.database}" +) + +################################################################## +# Cec settings +################################################################## +# 健康度接收SYSOM_HEALTH_METRIC格式异常指标主题 +CEC_TOPIC_SYSOM_HEALTH_METRIC = "SYSOM_HEALTH_METRIC" + +################################################################## +# gcache settings +################################################################## +CLUSTER_HEALTH_METRIC_GCACHE = "cluster_health_metrics" +NODE_HEALTH_METRIC_GCACHE = "node_health_metrics" +POD_HEALTH_METRIC_GCACHE = "pod_health_metrics" +CLUSTER_METRIC_EXPORTER = "cluster_metric_exporter" +NODE_METRIC_EXPORTER = "node_metric_exporter" +POD_METRIC_EXPORTER = "pod_metric_exporter" \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/develop.py b/sysom_server/sysom_cluster_health/conf/develop.py new file mode 100644 index 0000000000000000000000000000000000000000..10de7bfa9db72e095f5847119a877f0f66b37200 --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/develop.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File develoop.py +Description: +""" +from .common import * + +''' +开发环境配置项 +''' + +DEBUG = True \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/gunicorn.py b/sysom_server/sysom_cluster_health/conf/gunicorn.py new file mode 100644 index 0000000000000000000000000000000000000000..2ef538954eaf7915b371183c0799cad257291222 --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/gunicorn.py @@ -0,0 +1,23 @@ +''' +Channel Service Gunicorn Settings +''' +from conf.common import YAML_CONFIG + +bind = YAML_CONFIG.get_service_config().get("bind", "127.0.0.1") +port = YAML_CONFIG.get_service_config().get("port", "80") + +workers = 2 # 指定工作进程数 + +threads = 3 + +bind = f'{bind}:{port}' + +worker_class = 'uvicorn.workers.UvicornWorker' # 工作模式线程, 默认为sync模式 + +max_requests = 2000 # 设置最大并发数量为2000 (每个worker处理请求的工作线程) + +accesslog = '/var/log/sysom/sysom-cluster_health-access.log' + +loglevel = 'error' + +proc_name = 'sysom_cluster_health_service' diff --git a/sysom_server/sysom_cluster_health/conf/product.py b/sysom_server/sysom_cluster_health/conf/product.py new file mode 100644 index 0000000000000000000000000000000000000000..5d84aad58e8ec8d7b9f444753779a1e89823f070 --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/product.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File product.py +Description: +""" +from .common import * + +''' +生产环境配置项 +''' + +DEBUG = False diff --git a/sysom_server/sysom_cluster_health/conf/settings.py b/sysom_server/sysom_cluster_health/conf/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..4bc3d893dfab41ce9118b656e3f26b36ca6f419b --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/settings.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File settings.py +Description: +""" +import os + +env = os.environ.get("env", "product") + + +if env == "develop": + from .develop import * +elif env == "testing": + from .testing import * +elif env == "product": + from .product import * + +from .collector_settings import * +from .clusterhealth_settings import * + +# Prometheus to collect metrics +PROMETHEUS_CONFIG = YAML_CONFIG.get_server_config().db.prometheus +# No Cluster Label in metric, assume all metric is in one cluster +NO_CLUSTER_LABEL = True \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/testing.py b/sysom_server/sysom_cluster_health/conf/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..4dac2666ac26c853fac77689f318f3292c87c297 --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/testing.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/11/29 10:08 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File testing.py +Description: +""" +from .common import * + +''' +测试环境配置项 +''' +DEBUG = True diff --git a/sysom_server/sysom_cluster_health/config.yml b/sysom_server/sysom_cluster_health/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..44504ac91c5e7d0530ff9d363c6b00dce7d4ecc3 --- /dev/null +++ b/sysom_server/sysom_cluster_health/config.yml @@ -0,0 +1,35 @@ +vars: + SERVICE_NAME: &SERVICE_NAME sysom_cluster_health + SERVICE_CONSUMER_GROUP: + !concat &SERVICE_CONSUMER_GROUP [*SERVICE_NAME, "_consumer_group"] + +sysom_server: + cec: + consumer_group: &SYSOM_CEC_CHANNEL_CONSUMER_GROUP + +sysom_service: + service_name: *SERVICE_NAME + service_dir: *SERVICE_NAME + protocol: http + host: 127.0.0.1 + bind: 127.0.0.1 + port: 7020 + framework: + gcache: + protocol: redis + node_dispatch: + cmg: + tags: + - cluster_health + - FastApi + # Metadata of service + metadata: + check: + type: http + url: "/api/v1/cluster_health/health/check" + interval: 10 + timeout: 10 + deregister: 25 + header: + tls_skip_verify: false + diff --git a/sysom_server/sysom_cluster_health/lib/README.md b/sysom_server/sysom_cluster_health/lib/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ec74424ba6ecde5d035d0c113bafb0ddc6e4cfa --- /dev/null +++ b/sysom_server/sysom_cluster_health/lib/README.md @@ -0,0 +1 @@ +The current directory holds the public libraries or utils needed for microservices \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/lib/common_type.py b/sysom_server/sysom_cluster_health/lib/common_type.py new file mode 100644 index 0000000000000000000000000000000000000000..c1698ad62b39aba3c8ae7d619d223578e227da3d --- /dev/null +++ b/sysom_server/sysom_cluster_health/lib/common_type.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Optional + +@dataclass +class Labels: + cluster: str + instance: Optional[str] + namespace: Optional[str] + pod: Optional[str] + + def __init__(self, cluster, instance=None, namespace=None, pod=None): + self.cluster = cluster + self.instance = instance + self.namespace = namespace + self.pod = pod + +class Level(Enum): + Cluster = "cluster" + Node = "node" + Pod = "pod" \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/lib/utils.py b/sysom_server/sysom_cluster_health/lib/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..91a535f142808cff9ec5b5192dba24df79c666ac --- /dev/null +++ b/sysom_server/sysom_cluster_health/lib/utils.py @@ -0,0 +1,104 @@ +import time +import conf.settings as settings +from typing import List, Tuple +from metric_reader.metric_reader import MetricReader, RangeQueryTask +from clogger import logger + +INSTANCES_VRAIABLE = "sysom_proc_cpu_total" +PODS_VARIABLE = "sysom_container_memory_oomcnt" +NODE_LABELS = settings.NODE_LABEL +POD_LABELS = settings.POD_LABEL +NAMESPACE_LABELS = settings.NAMESPACE_LABEL + + +def collect_all_clusters(metric_reader: MetricReader) -> List[str]: + cluster_list = [] + res = metric_reader.get_label_values("cluster") + if len(res.data) <= 0: + logger.error("Collect all cluster failed!") + return cluster_list + return [item for item in res.data] + + +def collect_instances_of_cluster(cluster_id: str, metric_reader: MetricReader, + interval: int) -> List[str]: + """ Collect all instances of specific cluster + + Use "sysom_proc_cpu_total" metric to collect all pods + of specific instance, need to make sure the metric has been correctlly + exported (similar to grafana variables). + + Args: + instance_id: instance id + metric_reader: MetricReader instance of metric_reader sdk + interval: time interval of query + + Returns: + List of instances + """ + + instances_list = [] + + task = RangeQueryTask(INSTANCES_VRAIABLE, + start_time=time.time() - interval, + end_time=time.time()) \ + .append_equal_filter("mode", "total") \ + + if cluster_id != "default": + task.append_equal_filter("cluster", cluster_id) + + node_metric_res = metric_reader.range_query([task]) + if len(node_metric_res.data) <= 0: + logger.error( + f"Collect instances of {cluster_id} info: no instances found!") + return instances_list + + try: + for i in range(len(node_metric_res.data)): + labels = node_metric_res.data[i].to_dict()["labels"] + if NODE_LABELS in labels: + instances_list.append(labels[NODE_LABELS]) + except Exception as e: + raise e + + return list(set(instances_list)) + + +def collect_pods_of_instance(instance_id: str, metric_reader: MetricReader, + interval: int) -> List[Tuple[str, str]]: + """ Collect all pods of specific instance + + Use "sysom_container_memory_oomcnt" metric to collect all pods + of specific instance, need to make sure the metric has been correctlly + exported. + + Args: + instance_id: instance id + metric_reader: MetricReader instance of metric_reader sdk + interval: time interval of query + + Returns: + List of (pod name and namespace) + """ + + pod_list = [] + task = RangeQueryTask(PODS_VARIABLE, + start_time=time.time() - interval, + end_time=time.time()) \ + .append_equal_filter(NODE_LABELS, instance_id) + pod_metric_res = metric_reader.range_query([task]) + if len(pod_metric_res.data) <= 0: + logger.error(f"Collect pod of {instance_id} info: no pod found!") + return pod_list + + try: + for i in range(len(pod_metric_res.data)): + labels = pod_metric_res.data[i].to_dict()["labels"] + if POD_LABELS in labels and NAMESPACE_LABELS in labels: + pod_list.append( + (labels[POD_LABELS], labels[NAMESPACE_LABELS]) + ) + except Exception as e: + raise e + + return list(set(pod_list)) diff --git a/sysom_server/sysom_cluster_health/main.py b/sysom_server/sysom_cluster_health/main.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc013db2f012fa7c00222af910334ca8c3d7e0d --- /dev/null +++ b/sysom_server/sysom_cluster_health/main.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- # +from conf.settings import * +from os import getpid +from multiprocessing import Queue +from clogger import logger +from fastapi import FastAPI +from conf.settings import YAML_CONFIG +from sysom_utils import CmgPlugin, SysomFramework +from app.collector.metric_manager import MetricManager +from app.routers import health, metrics +from app.collector.collector import Collector +from app.health_calculator.calculator import HealthCalculator +from app.diagnose.diagnose_worker import DiagnoseWorker +from app.consumer.consumer import HealthMetricListener +from app.crud import del_all_abnormal_metrics_data +from app.database import SessionLocal + +app = FastAPI() + +app.include_router(health.router, prefix="/api/v1/cluster_health/health") +app.include_router(metrics.router) + +############################################################################# +# Write your API interface here, or add to app/routes +############################################################################# + + +def init_framwork(): + SysomFramework\ + .init(YAML_CONFIG) \ + .load_plugin_cls(CmgPlugin) \ + .start() + logger.info("SysomFramework init finished!") + + +def cleanup_gcache_data(): + gcache_names = [ + CLUSTER_HEALTH_METRIC_GCACHE, + NODE_HEALTH_METRIC_GCACHE, + POD_HEALTH_METRIC_GCACHE, + CLUSTER_METRIC_EXPORTER, + NODE_METRIC_EXPORTER, + POD_METRIC_EXPORTER + ] + + for name in gcache_names: + gcache = SysomFramework.gcache(name) + gcache.clean() + + if ABNORMAL_METRIC_STORAGE == "mysql": + with SessionLocal() as db: + del_all_abnormal_metrics_data(db) + +@app.on_event("startup") +async def on_start(): + init_framwork() + + cleanup_gcache_data() + + # load all registered metrics from settings + metric_manager = MetricManager() + metric_manager.metric_register() + + diagnose_queue = Queue(maxsize=MAX_QUEUE_SIZE) + pid = getpid(); + + # start analyzer to collect and calculate health score + try: + Collector( + queue=diagnose_queue, + metric_manager=metric_manager, + parent_pid=pid + ).start() + + HealthCalculator( + parent_pid=pid + ).start() + + DiagnoseWorker( + metric_manager=metric_manager, + queue=diagnose_queue, + parent_pid=pid).start() + + HealthMetricListener().start() + logger.info("集群健康度定时任务已启动") + except Exception as e: + logger.exception(e) + + ########################################################################## + # Perform some microservice initialization operations over here + ########################################################################## + + +@app.on_event("shutdown") +async def on_shutdown(): + cleanup_gcache_data() diff --git a/sysom_server/sysom_cmg/alembic.ini b/sysom_server/sysom_cmg/alembic.ini new file mode 100644 index 0000000000000000000000000000000000000000..f6ab9febcd93add9d0ea8857f857d7f30f1fe48f --- /dev/null +++ b/sysom_server/sysom_cmg/alembic.ini @@ -0,0 +1,102 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = alembic + +# template used to generate migration files +# file_template = %%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python-dateutil library that can be +# installed by adding `alembic[tz]` to the pip requirements +# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = "" + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/sysom_server/sysom_cmg/alembic/README b/sysom_server/sysom_cmg/alembic/README new file mode 100644 index 0000000000000000000000000000000000000000..98e4f9c44effe479ed38c66ba922e7bcc672916f --- /dev/null +++ b/sysom_server/sysom_cmg/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/sysom_server/sysom_cmg/alembic/env.py b/sysom_server/sysom_cmg/alembic/env.py new file mode 100644 index 0000000000000000000000000000000000000000..19c7e30c64f87d2d16ed5b3ecc2ed32cbbc15154 --- /dev/null +++ b/sysom_server/sysom_cmg/alembic/env.py @@ -0,0 +1,115 @@ +import inspect +import app.models as models +from logging.config import fileConfig +from sqlalchemy import engine_from_config, Table +from sqlalchemy import pool +from app.models import Base +from alembic import context +from conf.settings import YAML_CONFIG, SQLALCHEMY_DATABASE_URL + +################################################################## +# Load yaml config first +################################################################## +mysql_config = YAML_CONFIG.get_server_config().db.mysql + +################################################################## +# Scan models +################################################################## +service_tables = [] +for name, data in inspect.getmembers(models): + if inspect.isclass(data): + if data.__module__ != "app.models": + continue + if "__tablename__" in data.__dict__: + service_tables.append(data.__dict__["__tablename__"]) + elif "__table__" in data.__dict__: + service_tables.append(data.__dict__["__table__"]) + elif isinstance(data, Table): + service_tables.append(name) + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# Update mysql config according config.yml +config.set_main_option( + "sqlalchemy.url", + SQLALCHEMY_DATABASE_URL +) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + +def include_object(object, name, type_, reflected, compare_to): + if type_ == "table" and name not in service_tables: + return False + return True + + +def run_migrations_offline(): + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + include_object=include_object, + version_table="cmg_version", + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online(): + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata, + include_object=include_object, + version_table="cmg_version" + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/sysom_server/sysom_cmg/alembic/script.py.mako b/sysom_server/sysom_cmg/alembic/script.py.mako new file mode 100644 index 0000000000000000000000000000000000000000..2c0156303a8df3ffdc9de87765bf801bf6bea4a5 --- /dev/null +++ b/sysom_server/sysom_cmg/alembic/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade(): + ${upgrades if upgrades else "pass"} + + +def downgrade(): + ${downgrades if downgrades else "pass"} diff --git a/sysom_server/sysom_cmg/alembic/versions/.gitkeep b/sysom_server/sysom_cmg/alembic/versions/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_server/sysom_cmg/app/__init__.py b/sysom_server/sysom_cmg/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..30e79558304b508c489edd0fd7b541ee4af47f58 --- /dev/null +++ b/sysom_server/sysom_cmg/app/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File __init__.py +Description: +""" \ No newline at end of file diff --git a/sysom_server/sysom_cmg/app/crud.py b/sysom_server/sysom_cmg/app/crud.py new file mode 100644 index 0000000000000000000000000000000000000000..08d529f5276f383912dfa63a096e5f3d0df15621 --- /dev/null +++ b/sysom_server/sysom_cmg/app/crud.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File crud.py +Description: +""" +from typing import Optional, List +from sqlalchemy.orm import Session +from app import models, schemas, query + +################################################################################################ +# Define database crud here +################################################################################################ + +# def get_person_by_name(db: Session, name: str) -> Optional[models.Person]: +# return db.query(models.Person).filter(models.Person.name == name).first() + +# def create_person(db: Session, person: schemas.Person) -> models.Person: +# person = models.Person(**person.dict()) +# db.add(person) +# db.commit() +# db.refresh(person) +# return person + +# def del_person_by_id(db: Session, person_id: int): +# person = db.get(models.Person, person_id) +# db.delete(person) +# db.commit() + +# def get_person_list(db: Session, query_params: query.PersonQueryParams) -> List[models.Person]: +# return ( +# query_params.get_query_exp(db) +# .all() +# ) diff --git a/sysom_server/sysom_cmg/app/database.py b/sysom_server/sysom_cmg/app/database.py new file mode 100644 index 0000000000000000000000000000000000000000..773a7ada72834d1f9e6d6c3be3bfc681ca00877a --- /dev/null +++ b/sysom_server/sysom_cmg/app/database.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File database.py +Description: +""" +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from conf.settings import SQLALCHEMY_DATABASE_URL +from sysom_utils import FastApiResponseHelper + +engine = create_engine( + SQLALCHEMY_DATABASE_URL, connect_args={} +) + +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() + +Base = declarative_base() + +FastApiResponseHelper.bind_base_class(Base) \ No newline at end of file diff --git a/sysom_server/sysom_cmg/app/models.py b/sysom_server/sysom_cmg/app/models.py new file mode 100644 index 0000000000000000000000000000000000000000..7e2a989f3aa1907d390afd613b1338087af6003c --- /dev/null +++ b/sysom_server/sysom_cmg/app/models.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File models.py +Description: +""" +from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy.sql import func +from app.database import Base + + +########################################################################### +# Define databse model here +########################################################################### + +# @reference https://fastapi.tiangolo.com/zh/tutorial/sql-databases/ +# class Person(Base): +# __tablename__ = "sys_person" +# id = Column(Integer, primary_key=True) +# name = Column(String(254), unique=True) +# age = Column(Integer) +# created_at = Column(DateTime(timezone=True), server_default=func.now()) \ No newline at end of file diff --git a/sysom_server/sysom_cmg/app/query.py b/sysom_server/sysom_cmg/app/query.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbcd0898fc8d3f2be8d64ce694d7aa99ccb2332 --- /dev/null +++ b/sysom_server/sysom_cmg/app/query.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/09/19 15:41 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File schemas.py +Description: +""" +from typing import Optional +from app import models +from sysom_utils import BaseQueryParams + + +# class PersonQueryParams(BaseQueryParams): + +# # 1. 指定要查询的模型 +# __modelclass__ = models.Person + +# # 2. 定义排序字段 +# sort: str = "-created_at" + +# # 3. 定义支持用于过滤的参数 +# name: Optional[str] = None +# age: Optional[str] = None + +# # 4. 指定哪些字段是枚举类型,并且指明对应的枚举类 +# __enum_fields__ = { +# } \ No newline at end of file diff --git a/sysom_server/sysom_cmg/app/routers/health.py b/sysom_server/sysom_cmg/app/routers/health.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc7ca8b30990634c2bbe6bddd026b5209a1b08b --- /dev/null +++ b/sysom_server/sysom_cmg/app/routers/health.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File health.py +Description: +""" +from fastapi import APIRouter + + +router = APIRouter() + + +@router.get("/check") +async def get_channel_config(): + return { + "code": 0, + "err_msg": "", + "data": "" + } diff --git a/sysom_server/sysom_cmg/app/routers/person.py b/sysom_server/sysom_cmg/app/routers/person.py new file mode 100644 index 0000000000000000000000000000000000000000..822d6e02958587f71db1edb2ed93385c3430ab05 --- /dev/null +++ b/sysom_server/sysom_cmg/app/routers/person.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File health.py +Description: +""" +from fastapi import APIRouter, Depends +from sqlalchemy.orm import Session +from app.query import PersonQueryParams +from app.database import get_db +from app.crud import get_person_list, get_person_by_name +from app.schemas import Person +from sysom_utils import StandardListResponse, StandardResponse + + +router = APIRouter() + + +@router.get("/get") +async def get_specific_person( + person_name: str, db: Session = Depends(get_db) +): + person = get_person_by_name(db, person_name) + return StandardResponse(person, Person) + +@router.get("/list") +async def get_persons( + query_params: PersonQueryParams = Depends(), db: Session = Depends(get_db) +): + person_list = get_person_list(db, query_params) + return StandardListResponse(person_list, Person) diff --git a/sysom_server/sysom_cmg/app/routers/serivces.py b/sysom_server/sysom_cmg/app/routers/serivces.py new file mode 100644 index 0000000000000000000000000000000000000000..7bcad61986e4f589a0bf61d57f66fa3ab395affd --- /dev/null +++ b/sysom_server/sysom_cmg/app/routers/serivces.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +''' +@Author: wb-msm241621 +@Date: 2024-01-31 17:59:13 +@LastEditTime: 2024-01-31 17:59:13 +@Description: +''' +from fastapi import APIRouter +from conf.settings import * +from cmg_base import dispatch_service_discovery +from sysom_utils import StandardResponse, StandardListResponse +from ..schemas import ServiceItemModel + +router = APIRouter() + + +discovery = dispatch_service_discovery(YAML_CONFIG.get_cmg_url()) + + +@router.get("/list") +async def list_services(): + try: + services = discovery.get_services() + servicesResponseList = [ + ServiceItemModel( + service_name=service, + count=discovery.get_instance_count(service) + ) for service in services + ] + except Exception as exc: + return StandardResponse.error(str(exc)) + return StandardListResponse.success(servicesResponseList) diff --git a/sysom_server/sysom_cmg/app/schemas.py b/sysom_server/sysom_cmg/app/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..f3cf5db5cb0ccfb0af81a4dfe862fc001450f5f2 --- /dev/null +++ b/sysom_server/sysom_cmg/app/schemas.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File schemas.py +Description: +""" +from pydantic import BaseModel +from datetime import datetime + +########################################################################### +# Define schemas here +########################################################################### + +# @reference https://fastapi.tiangolo.com/zh/tutorial/response-model/ +# class Person(BaseModel): +# id: int +# name: str +# age: int +# created_at: datetime + +# class Config: +# orm_mode = True + + +class ServiceItemModel(BaseModel): + service_name: str + count: int diff --git a/sysom_server/sysom_cmg/conf/common.py b/sysom_server/sysom_cmg/conf/common.py new file mode 100644 index 0000000000000000000000000000000000000000..e89384c4d99169d6e126c5385d421812b2daa601 --- /dev/null +++ b/sysom_server/sysom_cmg/conf/common.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File common.py +Description: +""" +from pathlib import Path +from sysom_utils import ConfigParser, SysomFramework + +BASE_DIR = Path(__file__).resolve().parent.parent + +################################################################## +# Load yaml config first +################################################################## +YAML_GLOBAL_CONFIG_PATH = f"{BASE_DIR.parent.parent}/conf/config.yml" +YAML_SERVICE_CONFIG_PATH = f"{BASE_DIR}/config.yml" + +YAML_CONFIG = ConfigParser(YAML_GLOBAL_CONFIG_PATH, YAML_SERVICE_CONFIG_PATH) + +mysql_config = YAML_CONFIG.get_server_config().db.mysql +service_config = YAML_CONFIG.get_service_config() + +SysomFramework.init(YAML_CONFIG) + +################################################################## +# fastapi config +################################################################## +SQLALCHEMY_DATABASE_URL = ( + f"{mysql_config.dialect}+{mysql_config.engine}://{mysql_config.user}:{mysql_config.password}@" + f"{mysql_config.host}:{mysql_config.port}/{mysql_config.database}" +) \ No newline at end of file diff --git a/sysom_server/sysom_cmg/conf/develop.py b/sysom_server/sysom_cmg/conf/develop.py new file mode 100644 index 0000000000000000000000000000000000000000..dea9392e7c342c168133ac4fbae72ce3306fe0fc --- /dev/null +++ b/sysom_server/sysom_cmg/conf/develop.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File develoop.py +Description: +""" +from .common import * + +''' +开发环境配置项 +''' + +DEBUG = True \ No newline at end of file diff --git a/sysom_server/sysom_cmg/conf/gunicorn.py b/sysom_server/sysom_cmg/conf/gunicorn.py new file mode 100644 index 0000000000000000000000000000000000000000..b7234e4647320b90bf01f4920a6e7752747556aa --- /dev/null +++ b/sysom_server/sysom_cmg/conf/gunicorn.py @@ -0,0 +1,23 @@ +''' +Channel Service Gunicorn Settings +''' +from conf.common import YAML_CONFIG + +bind = YAML_CONFIG.get_service_config().get("bind", "127.0.0.1") +port = YAML_CONFIG.get_service_config().get("port", "80") + +workers = 2 # 指定工作进程数 + +threads = 3 + +bind = f'{bind}:{port}' + +worker_class = 'uvicorn.workers.UvicornWorker' # 工作模式线程, 默认为sync模式 + +max_requests = 2000 # 设置最大并发数量为2000 (每个worker处理请求的工作线程) + +accesslog = '/var/log/sysom/sysom-cmg-access.log' + +loglevel = 'error' + +proc_name = 'sysom_cmg_service' diff --git a/sysom_server/sysom_cmg/conf/product.py b/sysom_server/sysom_cmg/conf/product.py new file mode 100644 index 0000000000000000000000000000000000000000..2c9c65541bc23c8ab4559c67eb9c2e29525bca6c --- /dev/null +++ b/sysom_server/sysom_cmg/conf/product.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File product.py +Description: +""" +from .common import * + +''' +生产环境配置项 +''' + +DEBUG = False diff --git a/sysom_server/sysom_cmg/conf/settings.py b/sysom_server/sysom_cmg/conf/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..1ceea45f13218128244c79455ab87134eaecee87 --- /dev/null +++ b/sysom_server/sysom_cmg/conf/settings.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File settings.py +Description: +""" +import os + +env = os.environ.get("env", "product") + + +if env == "develop": + from .develop import * +elif env == "testing": + from .testing import * +elif env == "product": + from .product import * \ No newline at end of file diff --git a/sysom_server/sysom_cmg/conf/testing.py b/sysom_server/sysom_cmg/conf/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..46de2e5ce02c6cc90b5aeb9438181a88f2681dfd --- /dev/null +++ b/sysom_server/sysom_cmg/conf/testing.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File testing.py +Description: +""" +from .common import * + +''' +测试环境配置项 +''' +DEBUG = True diff --git a/sysom_server/sysom_cmg/config.yml b/sysom_server/sysom_cmg/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..c108f28ac6e042ac46e8b0351de142c198f0e8ad --- /dev/null +++ b/sysom_server/sysom_cmg/config.yml @@ -0,0 +1,35 @@ +vars: + SERVICE_NAME: &SERVICE_NAME sysom_cmg + SERVICE_CONSUMER_GROUP: + !concat &SERVICE_CONSUMER_GROUP [*SERVICE_NAME, "_consumer_group"] + +sysom_server: + cec: + consumer_group: *SERVICE_CONSUMER_GROUP + +sysom_service: + service_name: *SERVICE_NAME + service_dir: *SERVICE_NAME + protocol: http + host: 127.0.0.1 + bind: 127.0.0.1 + port: 7023 + framework: + gcache: + protocol: redis + node_dispatch: + cmg: + tags: + - cmg + - FastApi + # Metadata of service + metadata: + check: + type: http + url: "/api/v1/cmg/health/check" + interval: 10 + timeout: 10 + deregister: 25 + header: + tls_skip_verify: false + diff --git a/sysom_server/sysom_cmg/lib/README.md b/sysom_server/sysom_cmg/lib/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ec74424ba6ecde5d035d0c113bafb0ddc6e4cfa --- /dev/null +++ b/sysom_server/sysom_cmg/lib/README.md @@ -0,0 +1 @@ +The current directory holds the public libraries or utils needed for microservices \ No newline at end of file diff --git a/sysom_server/sysom_cmg/main.py b/sysom_server/sysom_cmg/main.py new file mode 100644 index 0000000000000000000000000000000000000000..ad4b9928077a0f9ad651ab84d9c14c7878e1c426 --- /dev/null +++ b/sysom_server/sysom_cmg/main.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- # +""" +Time 2024/01/31 17:58 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File ssh.py +Description: +""" +from clogger import logger +from fastapi import FastAPI +from app.routers import health, serivces +from conf.settings import YAML_CONFIG +from sysom_utils import CmgPlugin, SysomFramework + + +app = FastAPI() + +app.include_router(health.router, prefix="/api/v1/cmg/health") +app.include_router(serivces.router, prefix="/api/v1/cmg/services") +# app.include_router(health.router, prefix="/api/v1/cmg/person") + + +############################################################################# +# Write your API interface here, or add to app/routes +############################################################################# + + +def init_framwork(): + SysomFramework\ + .init(YAML_CONFIG) \ + .load_plugin_cls(CmgPlugin) \ + .start() + logger.info("SysomFramework init finished!") + + +@app.on_event("startup") +async def on_start(): + init_framwork() + + ############################################################################# + # Perform some microservice initialization operations over here + ############################################################################# + + +@app.on_event("shutdown") +async def on_shutdown(): + pass \ No newline at end of file diff --git a/sysom_server/sysom_colocation/alembic.ini b/sysom_server/sysom_colocation/alembic.ini new file mode 100644 index 0000000000000000000000000000000000000000..f6ab9febcd93add9d0ea8857f857d7f30f1fe48f --- /dev/null +++ b/sysom_server/sysom_colocation/alembic.ini @@ -0,0 +1,102 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = alembic + +# template used to generate migration files +# file_template = %%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python-dateutil library that can be +# installed by adding `alembic[tz]` to the pip requirements +# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = "" + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/sysom_server/sysom_colocation/alembic/README b/sysom_server/sysom_colocation/alembic/README new file mode 100644 index 0000000000000000000000000000000000000000..98e4f9c44effe479ed38c66ba922e7bcc672916f --- /dev/null +++ b/sysom_server/sysom_colocation/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/sysom_server/sysom_colocation/alembic/env.py b/sysom_server/sysom_colocation/alembic/env.py new file mode 100644 index 0000000000000000000000000000000000000000..9147fc27b84dedd1d9adee57aef48f3734accdfd --- /dev/null +++ b/sysom_server/sysom_colocation/alembic/env.py @@ -0,0 +1,115 @@ +import inspect +import app.models as models +from logging.config import fileConfig +from sqlalchemy import engine_from_config, Table +from sqlalchemy import pool +from app.models import Base +from alembic import context +from conf.settings import YAML_CONFIG, SQLALCHEMY_DATABASE_URL + +################################################################## +# Load yaml config first +################################################################## +mysql_config = YAML_CONFIG.get_server_config().db.mysql + +################################################################## +# Scan models +################################################################## +service_tables = [] +for name, data in inspect.getmembers(models): + if inspect.isclass(data): + if data.__module__ != "app.models": + continue + if "__tablename__" in data.__dict__: + service_tables.append(data.__dict__["__tablename__"]) + elif "__table__" in data.__dict__: + service_tables.append(data.__dict__["__table__"]) + elif isinstance(data, Table): + service_tables.append(name) + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# Update mysql config according config.yml +config.set_main_option( + "sqlalchemy.url", + SQLALCHEMY_DATABASE_URL +) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + +def include_object(object, name, type_, reflected, compare_to): + if type_ == "table" and name not in service_tables: + return False + return True + + +def run_migrations_offline(): + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + include_object=include_object, + version_table="colocation_version", + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online(): + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata, + include_object=include_object, + version_table="colocation_version" + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/sysom_server/sysom_colocation/alembic/script.py.mako b/sysom_server/sysom_colocation/alembic/script.py.mako new file mode 100644 index 0000000000000000000000000000000000000000..2c0156303a8df3ffdc9de87765bf801bf6bea4a5 --- /dev/null +++ b/sysom_server/sysom_colocation/alembic/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade(): + ${upgrades if upgrades else "pass"} + + +def downgrade(): + ${downgrades if downgrades else "pass"} diff --git a/sysom_server/sysom_colocation/alembic/versions/.gitkeep b/sysom_server/sysom_colocation/alembic/versions/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_server/sysom_colocation/app/__init__.py b/sysom_server/sysom_colocation/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a6faa2f617cfb171a342f1750c00d65136605914 --- /dev/null +++ b/sysom_server/sysom_colocation/app/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File __init__.py +Description: +""" \ No newline at end of file diff --git a/sysom_server/sysom_colocation/app/crud.py b/sysom_server/sysom_colocation/app/crud.py new file mode 100644 index 0000000000000000000000000000000000000000..4af610a6f88bdbfa0623f7110724cdaa2130fad7 --- /dev/null +++ b/sysom_server/sysom_colocation/app/crud.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File crud.py +Description: +""" +from typing import Optional, List +from sqlalchemy.orm import Session +from app import models, schemas, query + +################################################################################################ +# Define database crud here +################################################################################################ + +# def get_person_by_name(db: Session, name: str) -> Optional[models.Person]: +# return db.query(models.Person).filter(models.Person.name == name).first() + +# def create_person(db: Session, person: schemas.Person) -> models.Person: +# person = models.Person(**person.dict()) +# db.add(person) +# db.commit() +# db.refresh(person) +# return person + +# def del_person_by_id(db: Session, person_id: int): +# person = db.get(models.Person, person_id) +# db.delete(person) +# db.commit() + +# def get_person_list(db: Session, query_params: query.PersonQueryParams) -> List[models.Person]: +# return ( +# query_params.get_query_exp(db) +# .all() +# ) diff --git a/sysom_server/sysom_colocation/app/database.py b/sysom_server/sysom_colocation/app/database.py new file mode 100644 index 0000000000000000000000000000000000000000..0ecf87c503d3ddf7841791bcab5af0e0ed4508d3 --- /dev/null +++ b/sysom_server/sysom_colocation/app/database.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File database.py +Description: +""" +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker +from conf.settings import SQLALCHEMY_DATABASE_URL +from sysom_utils import FastApiResponseHelper + +engine = create_engine( + SQLALCHEMY_DATABASE_URL, connect_args={} +) + +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() + +Base = declarative_base() + +FastApiResponseHelper.bind_base_class(Base) \ No newline at end of file diff --git a/sysom_server/sysom_colocation/app/models.py b/sysom_server/sysom_colocation/app/models.py new file mode 100644 index 0000000000000000000000000000000000000000..c20367d90f96fa97c5120a109b1bb10a35887c42 --- /dev/null +++ b/sysom_server/sysom_colocation/app/models.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File models.py +Description: +""" +from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy.sql import func +from app.database import Base + + +########################################################################### +# Define databse model here +########################################################################### + +# @reference https://fastapi.tiangolo.com/zh/tutorial/sql-databases/ +# class Person(Base): +# __tablename__ = "sys_person" +# id = Column(Integer, primary_key=True) +# name = Column(String(254), unique=True) +# age = Column(Integer) +# created_at = Column(DateTime(timezone=True), server_default=func.now()) \ No newline at end of file diff --git a/sysom_server/sysom_colocation/app/query.py b/sysom_server/sysom_colocation/app/query.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbcd0898fc8d3f2be8d64ce694d7aa99ccb2332 --- /dev/null +++ b/sysom_server/sysom_colocation/app/query.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/09/19 15:41 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File schemas.py +Description: +""" +from typing import Optional +from app import models +from sysom_utils import BaseQueryParams + + +# class PersonQueryParams(BaseQueryParams): + +# # 1. 指定要查询的模型 +# __modelclass__ = models.Person + +# # 2. 定义排序字段 +# sort: str = "-created_at" + +# # 3. 定义支持用于过滤的参数 +# name: Optional[str] = None +# age: Optional[str] = None + +# # 4. 指定哪些字段是枚举类型,并且指明对应的枚举类 +# __enum_fields__ = { +# } \ No newline at end of file diff --git a/sysom_server/sysom_colocation/app/routers/health.py b/sysom_server/sysom_colocation/app/routers/health.py new file mode 100644 index 0000000000000000000000000000000000000000..e994f504a728416b48b63b0f4274caed614d858e --- /dev/null +++ b/sysom_server/sysom_colocation/app/routers/health.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File health.py +Description: +""" +from fastapi import APIRouter + + +router = APIRouter() + + +@router.get("/check") +async def get_channel_config(): + return { + "code": 0, + "err_msg": "", + "data": "" + } diff --git a/sysom_server/sysom_colocation/app/schemas.py b/sysom_server/sysom_colocation/app/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..7d42e4d1a801aeac40739172ffab97bb1464398a --- /dev/null +++ b/sysom_server/sysom_colocation/app/schemas.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File schemas.py +Description: +""" +from pydantic import BaseModel +from datetime import datetime + +########################################################################### +# Define schemas here +########################################################################### + +# @reference https://fastapi.tiangolo.com/zh/tutorial/response-model/ +# class Person(BaseModel): +# id: int +# name: str +# age: int +# created_at: datetime + +# class Config: +# orm_mode = True \ No newline at end of file diff --git a/sysom_server/sysom_colocation/app/worker/cpi_worker.py b/sysom_server/sysom_colocation/app/worker/cpi_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..2112cfcf2860d6acfdf1df0c5128e65f1dd61568 --- /dev/null +++ b/sysom_server/sysom_colocation/app/worker/cpi_worker.py @@ -0,0 +1,131 @@ +from datetime import datetime +from time import time, sleep +import conf.settings as settings +from multiprocessing import Process +from schedule import Scheduler +from os import getpid +from conf.common import PROMETHEUS_DATABASE_URL +from sysom_utils import SysomFramework +from clogger import logger +from metric_reader import dispatch_metric_reader +from lib.table import ContainerPmuEvents +from lib.pod_state import pod_mgr +from lib.utils import ( + collect_all_clusters, + collect_instances_of_cluster, + generate_unique_key, +) +import uuid + + +class InstanceCPIChecker: + def __init__(self, cluster: str, instance: str, metric_reader) -> None: + self._cluster = cluster + self._instance = instance + self._table = ContainerPmuEvents(metric_reader, self._cluster, self._instance) + self._last_check = time() - settings.DEFAULT_FIRST_CHECK_INTERVAL + + def _check_all_pod_cpi(self): + now = time() + container_list = self._table.query_lastest_cpi( + start_time=self._last_check, end_time=now + ) + for item in container_list: + for data in item["data"]: + ts = float(data[0]) + cpi = float(data[1]) + alarm = pod_mgr.cpi2_alarm( + self._cluster, + self._instance, + item["ns"], + item["pod"], + item["con"], + cpi, + ts, + ) + if alarm is not None: + cpi_mean, cpi_stddev = alarm + logger.info( + f"[CPI2] Alarm cluster:{self._cluster}, instance:{self._instance}, pod:{item['ns']}, {item['pod']}, container:{item['con']}, cpi:{cpi}, ts:{ts}. mean={cpi_mean} stddev={cpi_stddev}" + ) + alert_id = str(uuid.uuid4()) + SysomFramework.alarm( + { + "alert_id": alert_id, + "instance": self._instance, + "alert_item": "CPI2 Anormal", + "alert_category": "MONITOR", + "alert_source_type": "SysOM", + "alert_time": int(round(time() * 1000)), + "status": "FIRING", + "alert_level": "WARNING", + "labels": {}, + "annotations": { + "summary": f"5分钟内 {item['ns']} {item['pod']} {item['con']}, 出现三次异常CPI 可能存在oncpu干扰", + "SYSOM_ALARM:OPT:sysom_diagnose:colocation_cpi_diagnose": { + "label": "根因分析", + "type": "LINK", + "url": f"/diagnose/colocation/cpi?instance={self._instance}&moment={datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')}", + }, + }, + } + ) + self._last_check = now + + def call(self): + self._check_all_pod_cpi() + + +class CPIWorker(Process): + def __init__(self, interval_sec: int = 15) -> None: + super().__init__(daemon=True) + self.interval_sec = interval_sec + self.scheduler: Scheduler = Scheduler() + self.current_pid = getpid() + self.metric_reader = dispatch_metric_reader(PROMETHEUS_DATABASE_URL) + self.instances = {} + + def _check_instance(self) -> None: + cluster_list = collect_all_clusters(self.metric_reader) + + # no cluster label, we assume just one, and names it "dafault" + if len(cluster_list) == 0 or settings.NO_CLUSTER_LABEL: + cluster_list.append("default") + + for cluster in cluster_list: + instance_list = collect_instances_of_cluster( + cluster, self.metric_reader, 60 + ) + for instance in instance_list: + unique_key = generate_unique_key(cluster, instance) + if unique_key in self.instances.keys(): + continue + logger.info( + f"create new InstanceCPIChecker cluster is {cluster} instance is {instance}" + ) + self.instances[unique_key] = InstanceCPIChecker( + cluster, instance, self.metric_reader + ) + + def _update(self) -> None: + logger.debug(f"CPI Worker alive...") + self._check_instance() + for ins in self.instances.values(): + ins.call() + + def run(self) -> None: + logger.info(f"CPI Worker running on pid {self.current_pid}") + + self._update() + self.scheduler.every(self.interval_sec).seconds.do(self._update) + + while True: + if self.is_alive(): + try: + self.scheduler.run_pending() + except Exception as e: + logger.error(f"CPI Worker error: {e}") + finally: + sleep(max(1, int(self.interval_sec / 2))) + else: + break diff --git a/sysom_server/sysom_colocation/app/worker/instance_manager.py b/sysom_server/sysom_colocation/app/worker/instance_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..05d2cba9a0860ba9d6b22a6f4d59516795c9e378 --- /dev/null +++ b/sysom_server/sysom_colocation/app/worker/instance_manager.py @@ -0,0 +1,358 @@ +from metric_reader.metric_reader import MetricReader +from sysom_utils import SysomFramework +from time import time +import conf.settings as settings +from lib.table import * +from lib.algo import PeriodAlgo +from clogger import logger +from lib.utils import generate_unique_key, ts_2_hour +import json + + +class MetricAdpater: + def __init__( + self, + predict_table: PredictTable, + allocation_table: AllocationTable, + algo: PeriodAlgo, + rt_gcache: SysomFramework.gcache, + future_gcache: SysomFramework.gcache, + ) -> None: + self._predict_table = predict_table + self._allocation_table = allocation_table + self._algo = algo + self._rt_gcache = rt_gcache + self._future_gcache = future_gcache + + def _generate_key(self, *arg, **kwargs) -> str: + return generate_unique_key( + self._predict_table._cluster, + self._predict_table._instance, + self._predict_table._resource, + self._predict_table._tag, + *arg, + **kwargs, + ) + + def _generate_rt_value(self, category, value) -> str: + values = { + "cluster": self._predict_table._cluster, + "instance": self._predict_table._instance, + "resource": self._predict_table._resource, + "tag": self._predict_table._tag, + "category": category, + "value": value, + } + return json.dumps(values) + + def _generate_future_value(self, category, value, future) -> str: + values = { + "cluster": self._predict_table._cluster, + "instance": self._predict_table._instance, + "resource": self._predict_table._resource, + "tag": self._predict_table._tag, + "category": category, + "future": future, + "value": value, + } + return json.dumps(values) + + def future_call(self) -> None: + if self._future_gcache is None: + return + + logger.debug("future_call is running...") + allocation = None + # calculate the future watermark(slack and predict) + future_watermark = self._algo.predict_future() + if self._allocation_table is not None: + allocation = self._allocation_table.query_allocation() + if allocation is None: + logger.error( + f"Query resource allocation failed. table_name = {self._allocation_table._table_name}" + ) + + current = ts_2_hour(time()) + for offset in range(24): + index = (current + offset) % 24 + watermark = future_watermark[index] + logger.debug( + f"store the future watermark offset={offset} watermark={watermark}..." + ) + self._future_gcache.store( + self._generate_key(settings.CATEGORY_PERDICT, str(offset)), + self._generate_future_value( + settings.CATEGORY_PERDICT, watermark, offset + ), + ) + if allocation is None: + continue + slack = max(0, allocation - watermark) * settings.SLACK_FACTOR + self._future_gcache.store( + self._generate_key(settings.CATEGORY_SLACK, str(offset)), + self._generate_future_value(settings.CATEGORY_SLACK, slack, offset), + ) + + def rt_call(self, start_time: float, end_time: float) -> None: + df = self._predict_table.range_query(start_time, end_time) + for _, row in df.iterrows(): + self._algo.learn(row["ts"], row["metric"]) + util = self._algo.predict(time()) + self._rt_gcache.store( + self._generate_key(settings.CATEGORY_PERDICT), + self._generate_rt_value(settings.CATEGORY_PERDICT, util), + ) + + # calculate the slack resource. only for LS. + if self._allocation_table is not None: + allocation = self._allocation_table.query_allocation() + if allocation is None: + logger.error( + f"Query resource allocation failed. table_name = {self._allocation_table._table_name}" + ) + return + slack = max(allocation - util, 0) * settings.SLACK_FACTOR + + self._rt_gcache.store( + self._generate_key(settings.CATEGORY_SLACK), + self._generate_rt_value(settings.CATEGORY_SLACK, slack), + ) + + +class InstanceManager: + def __init__( + self, + cluster: str, + instance: str, + reader: MetricReader, + rt_gcache: SysomFramework.gcache, + future_gcache: SysomFramework.gcache, + ) -> None: + self._cluster = cluster + self._instance = instance + self._metric_reader = reader + self._rt_gcache = rt_gcache + self._future_gcache = future_gcache + self._last_rt_call = time() - settings.DEFAULT_HISTORY_SPAN_SECS + self._last_future_call = None + self._cpu_max = ProcCpusTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_CPU, + settings.TAG_ALL, + ).query_max() + self._mem_max = ProcMeminfoTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_MEMORY, + settings.TAG_ALL, + -1, + ).query_max() + + if self._cpu_max is None or self._mem_max is None: + logger.error(f"InstanceManager init failed. get max failed.") + self._adapters = [] + self.init_adapters() + + def default_algo(self) -> PeriodAlgo: + return PeriodAlgo( + settings.ALGO_WINDOW_MINUTES, + settings.ALGO_SLOT_INTERVAl, + settings.ALGO_HALF_LIFE, + ) + + def init_adapters(self) -> None: + self.create_cpu_metric_adapters() + self.create_mem_metric_adapters() + + def create_mem_metric_adapters(self) -> None: + # Node Memory Predict + self._adapters.append( + MetricAdpater( + ProcMeminfoTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_MEMORY, + settings.TAG_ALL, + self._mem_max, + ), + None, + self.default_algo(), + self._rt_gcache, + self._future_gcache, + ) + ) + + # Ls Memory Predict & Ls Memory Slack + self._adapters.append( + MetricAdpater( + ContainerMemutilTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_MEMORY, + settings.TAG_LS, + self._mem_max, + settings.TAG_LS, + ), + ContainerMemutilTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_MEMORY, + settings.TAG_LS, + self._mem_max, + settings.TAG_LS, + ), + self.default_algo(), + self._rt_gcache, + self._future_gcache, + ) + ) + + # BE Memory Predict + self._adapters.append( + MetricAdpater( + ContainerMemutilTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_MEMORY, + settings.TAG_BE, + self._mem_max, + settings.TAG_BE, + ), + None, + self.default_algo(), + self._rt_gcache, + None, + ) + ) + + # NOR Memory Predict + self._adapters.append( + MetricAdpater( + ContainerMemutilTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_MEMORY, + settings.TAG_NOR, + self._mem_max, + settings.TAG_NOR, + ), + None, + self.default_algo(), + self._rt_gcache, + None, + ) + ) + + def create_cpu_metric_adapters(self) -> None: + # Node CPU + self._adapters.append( + MetricAdpater( + ProcCpuTotalTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_CPU, + settings.TAG_ALL, + 100, + ), + None, + self.default_algo(), + self._rt_gcache, + self._future_gcache, + ) + ) + # LS CPU + self._adapters.append( + MetricAdpater( + ContainerCpuacctStatTable( + self._metric_reader, + self._cluster, + self._instance, + settings.TAG_LS, + settings.RESOURCE_CPU, + settings.TAG_LS, + self._cpu_max, + ), + ContainerCfsQuotaTable( + self._metric_reader, + self._cluster, + self._instance, + settings.RESOURCE_CPU, + settings.TAG_LS, + self._cpu_max, + ), + self.default_algo(), + self._rt_gcache, + self._future_gcache, + ) + ) + + # BE CPU + self._adapters.append( + MetricAdpater( + ContainerCpuacctStatTable( + self._metric_reader, + self._cluster, + self._instance, + settings.TAG_BE, + settings.RESOURCE_CPU, + settings.TAG_BE, + self._cpu_max, + ), + None, + self.default_algo(), + self._rt_gcache, + None, + ) + ) + + # NOR CPU + self._adapters.append( + MetricAdpater( + ContainerCpuacctStatTable( + self._metric_reader, + self._cluster, + self._instance, + settings.TAG_NOR, + settings.RESOURCE_CPU, + settings.TAG_NOR, + self._cpu_max, + ), + None, + self.default_algo(), + self._rt_gcache, + None, + ) + ) + + def call(self) -> None: + """ This function will notify all adapter get the data in range and feed to model.\ + than predict current watermark. \ + If it's the first evoke, it will try learn the history data. + """ + logger.debug("InstanceManager call running...") + now = time() + for adapter in self._adapters: + start = self._last_rt_call + end = start + settings.SEC_PER_DAY + while True: + adapter.rt_call(start_time=start, end_time=min(end, now)) + if end > now: + break + start = end + end = start + settings.SEC_PER_DAY + self._last_rt_call = now + + current = ts_2_hour(time()) + if self._last_future_call is None or self._last_future_call != current: + for adapter in self._adapters: + adapter.future_call() + self._last_future_call = current diff --git a/sysom_server/sysom_colocation/app/worker/period_predict_worker.py b/sysom_server/sysom_colocation/app/worker/period_predict_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..c062bf19c69eeeb327e66ac7110f3a7b4854fa64 --- /dev/null +++ b/sysom_server/sysom_colocation/app/worker/period_predict_worker.py @@ -0,0 +1,73 @@ +import time +import conf.settings as settings +from multiprocessing import Process +from schedule import Scheduler +from os import getpid +from conf.common import PROMETHEUS_DATABASE_URL +from sysom_utils import SysomFramework +from clogger import logger +from metric_reader import dispatch_metric_reader +from lib.utils import ( + collect_all_clusters, + collect_instances_of_cluster, + generate_unique_key, +) +from .instance_manager import InstanceManager + + +class PeriodPredictWorker(Process): + def __init__(self, interval_sec: int = 15) -> None: + super().__init__(daemon=True) + self.interval_sec = interval_sec + self.scheduler: Scheduler = Scheduler() + self.current_pid = getpid() + self.metric_reader = dispatch_metric_reader(PROMETHEUS_DATABASE_URL) + self.instances = {} + + def _check_instance(self) -> None: + cluster_list = collect_all_clusters(self.metric_reader) + + # no cluster label, we assume just one, and names it "dafault" + if len(cluster_list) == 0 or settings.NO_CLUSTER_LABEL: + cluster_list.append("default") + + for cluster in cluster_list: + instance_list = collect_instances_of_cluster( + cluster, self.metric_reader, 60 + ) + for instance in instance_list: + unique_key = generate_unique_key(cluster, instance) + if unique_key in self.instances.keys(): + continue + logger.info( + f"create new InstanceManager cluster is {cluster} instance is {instance}" + ) + self.instances[unique_key] = InstanceManager( + cluster, + instance, + self.metric_reader, + SysomFramework.gcache(settings.CACHE_RT), + SysomFramework.gcache(settings.CACHE_FUTURE), + ) + + def _update(self) -> None: + self._check_instance() + for ins in self.instances.values(): + ins.call() + + def run(self) -> None: + logger.info(f"PeriodModleWorker running on pid {self.current_pid}") + + self._update() + self.scheduler.every(self.interval_sec).seconds.do(self._update) + + while True: + if self.is_alive(): + try: + self.scheduler.run_pending() + except Exception as e: + logger.error(f"PeriodModleWorker error {e}") + finally: + time.sleep(max(1, int(self.interval_sec / 2))) + else: + break diff --git a/sysom_server/sysom_colocation/app/worker/serve_util_worker.py b/sysom_server/sysom_colocation/app/worker/serve_util_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..8b82c9de2c73b2b7f9f29d280992d528633719fc --- /dev/null +++ b/sysom_server/sysom_colocation/app/worker/serve_util_worker.py @@ -0,0 +1,130 @@ +from datetime import datetime +from time import time, sleep +import conf.settings as settings +from multiprocessing import Process +from schedule import Scheduler +from os import getpid +from conf.common import PROMETHEUS_DATABASE_URL +from sysom_utils import SysomFramework +from clogger import logger +from metric_reader import dispatch_metric_reader +from lib.table import ContaierCfsStatistics +from lib.pod_state import pod_mgr +from lib.utils import ( + collect_all_clusters, + collect_instances_of_cluster, + generate_unique_key, +) +import uuid + + +class InstanceServeUtilChecker: + def __init__(self, cluster: str, instance: str, metric_reader) -> None: + self._cluster = cluster + self._instance = instance + self._table = ContaierCfsStatistics(metric_reader, cluster, instance) + self._last_check = time() - settings.DEFAULT_FIRST_CHECK_INTERVAL + + def _check_all_pod_serve_util(self): + now = time() + container_list = self._table.query_serve_util_rate(self._last_check, now) + for item in container_list: + for val in item["serveutil"]: + ts = float(val[0]) + util = float(val[1]) + if util == 0: + # util is zero means that current system not support the metric + continue + alarm = pod_mgr.cfs_alarm( + self._cluster, + self._instance, + item["ns"], + item["pod"], + item["con"], + util, + ) + if alarm is not None: + logger.info( + f"[ServeUtil] check Serveutil alarm: ts={ts} pod={item['ns']}-{item['pod']}-container:{item['con']} serveutil={util}." + ) + alert_id = str(uuid.uuid4()) + SysomFramework.alarm( + { + "alert_id": alert_id, + "instance": self._instance, + "alert_item": "Service Utilization Anormal", + "alert_category": "MONITOR", + "alert_source_type": "SysOM", + "alert_time": int(round(time() * 1000)), + "status": "FIRING", + "alert_level": "WARNING", + "labels": {}, + "annotations": { + "summary": f"{item['ns']}-{item['pod']}-{item['con']} CFS满足率较低, 可能存在on queue干扰", + "SYSOM_ALARM:OPT:sysom_diagnose:colocation_serveutil_diagnose": { + "label": "根因分析", + "type": "LINK", + "url": f"/diagnose/colocation/serveutil?instance={self._instance}&moment={datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')}", + }, + }, + } + ) + + logger.debug(f"Serveutl alarm add action: alert_id={alert_id}") + + self._last_check = now + + def call(self): + self._check_all_pod_serve_util() + + +class ServeUtilWorker(Process): + def __init__(self, interval_sec: int = 15) -> None: + super().__init__(daemon=True) + self.interval_sec = interval_sec + self.scheduler: Scheduler = Scheduler() + self.current_pid = getpid() + self.metric_reader = dispatch_metric_reader(PROMETHEUS_DATABASE_URL) + self.instances = {} + + def _check_instance(self) -> None: + cluster_list = collect_all_clusters(self.metric_reader) + + # no cluster label, we assume just one, and names it "dafault" + if len(cluster_list) == 0 or settings.NO_CLUSTER_LABEL: + cluster_list.append("default") + + for cluster in cluster_list: + instance_list = collect_instances_of_cluster( + cluster, self.metric_reader, 60 + ) + for instance in instance_list: + unique_key = generate_unique_key(cluster, instance) + if unique_key in self.instances.keys(): + continue + + self.instances[unique_key] = InstanceServeUtilChecker( + cluster, instance, self.metric_reader + ) + + def _update(self) -> None: + logger.debug(f"ServeUtil Worker alive...") + self._check_instance() + for ins in self.instances.values(): + ins.call() + + def run(self) -> None: + logger.info(f"ServeUtil Worker running on pid {self.current_pid}") + self._update() + self.scheduler.every(self.interval_sec).seconds.do(self._update) + + while True: + if self.is_alive(): + try: + self.scheduler.run_pending() + except Exception as e: + logger.error(f"ServeUtil Worker error: {e}") + finally: + sleep(max(1, int(self.interval_sec / 2))) + else: + break diff --git a/sysom_server/sysom_colocation/app/worker/sys_worker.py b/sysom_server/sysom_colocation/app/worker/sys_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..0a62e7c71bed17c38447bdaafc4fddf8f9740925 --- /dev/null +++ b/sysom_server/sysom_colocation/app/worker/sys_worker.py @@ -0,0 +1,123 @@ +from time import time, sleep +import conf.settings as settings +from multiprocessing import Process +from schedule import Scheduler +from os import getpid +from conf.common import PROMETHEUS_DATABASE_URL +from sysom_utils import SysomFramework +from clogger import logger +from metric_reader import dispatch_metric_reader +from lib.table import ContainerCpuacctStatTable +from lib.pod_state import pod_mgr +from lib.utils import ( + collect_all_clusters, + collect_instances_of_cluster, + generate_unique_key, +) +import uuid + + +class InstanceSysChecker: + def __init__(self, cluster: str, instance: str, metric_reader) -> None: + self._cluster = cluster + self._instance = instance + self._table = ContainerCpuacctStatTable( + metric_reader, self._cluster, self._instance, None, None, None, None + ) + self._last_check = time() - settings.DEFAULT_FIRST_CHECK_INTERVAL + + def _check_all_pod_sys(self): + now = time() + pod_list = self._table.query_cpu_time(self._last_check, now) + for pod in pod_list: + for user, sys in zip(pod["user"], pod["system"]): + alarm = pod_mgr.sys_alarm( + self._cluster, + self._instance, + pod["ns"], + pod["pod"], + pod["con"], + float(sys[1]), + float(user[1]), + ) + if alarm is not None: + # quota = alarm + logger.info( + f"[SysWorker] check sys alarm: ts={time()} pod={pod['ns']}-{pod['pod']}-container:{pod['con']} sys={float(sys[1])} user={float(user[1])}." + ) + SysomFramework.alarm( + { + "alert_id": str(uuid.uuid4()), + "instance": self._instance, + "alert_item": "System Time Anormal", + "alert_category": "MONITOR", + "alert_source_type": "SysOM", + "alert_time": int(round(time() * 1000)), + "status": "FIRING", + "alert_level": "WARNING", + "labels": {}, + "annotations": { + "summary": f"{pod['ns']}-{pod['pod']}-{pod['con']} System时间占比偏高, 可能存在oncpu干扰" + }, + } + ) + + self._last_check = now + + def call(self): + self._check_all_pod_sys() + + +class SysWorker(Process): + def __init__(self, interval_sec: int = 15) -> None: + super().__init__(daemon=True) + self.interval_sec = interval_sec + self.scheduler: Scheduler = Scheduler() + self.current_pid = getpid() + self.metric_reader = dispatch_metric_reader(PROMETHEUS_DATABASE_URL) + self.instances = {} + + def _check_instance(self) -> None: + cluster_list = collect_all_clusters(self.metric_reader) + + # no cluster label, we assume just one, and names it "dafault" + if len(cluster_list) == 0 or settings.NO_CLUSTER_LABEL: + cluster_list.append("default") + + for cluster in cluster_list: + instance_list = collect_instances_of_cluster( + cluster, self.metric_reader, 60 + ) + for instance in instance_list: + unique_key = generate_unique_key(cluster, instance) + if unique_key in self.instances.keys(): + continue + logger.info( + f"create new InstanceSysChecker cluster is {cluster} instance is {instance}" + ) + self.instances[unique_key] = InstanceSysChecker( + cluster, instance, self.metric_reader + ) + + def _update(self) -> None: + logger.debug(f"Sys Worker alive...") + self._check_instance() + for ins in self.instances.values(): + ins.call() + + def run(self) -> None: + logger.info(f"Sys Worker running on pid {self.current_pid}") + + self._update() + self.scheduler.every(self.interval_sec).seconds.do(self._update) + + while True: + if self.is_alive(): + try: + self.scheduler.run_pending() + except Exception as e: + logger.error(f"Sys Worker error: {e}") + finally: + sleep(max(1, int(self.interval_sec / 2))) + else: + break diff --git a/sysom_server/sysom_colocation/conf/colocation_settings.py b/sysom_server/sysom_colocation/conf/colocation_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..d4d5801cb0bca05ff0c9e1be60afa483f0ba084f --- /dev/null +++ b/sysom_server/sysom_colocation/conf/colocation_settings.py @@ -0,0 +1,88 @@ +################################################################################# +# Cache Settings +################################################################################# + +# the realtime predict metric cache +CACHE_RT = "colocation_instance_rt_cache" +# the future predict metric cache +CACHE_FUTURE = "colocation_instance_future_cache" + +################################################################################# +# Table Settings +################################################################################# + +# calculate the cpu usage by sum the labels metric +PORC_CPU_USGAE = ["softirq", "user", "sys", "hardirq", "nice"] + +# the table name +TABLE_PROC_CPU_TOTAL = "sysom_proc_cpu_total" +TABLE_CONTAINER_CPUACCT_STAT = "sysom_container_cpuacct_stat" +TABLE_CONTAINER_CFS_QUOTA = "sysom_container_cfs_quota" +TABLE_PROC_CPUS = "sysom_proc_cpus" +TABLE_PROC_MEMINFO = "sysom_proc_meminfo" +TABLE_CONTAINER_MEMUTIL = "sysom_container_memUtil" +TABLE_CONTANINER_PMU_EVENTS = "sysom_container_pmu_events" +TABLE_CONTANINER_CFS_STATISTICS = "sysom_container_cfs_statis" + +# the metric label +CLUSTER_LABEL = "cluster" +NODE_LABEL = "instance" +POD_LABEL = "pod" + +# the default return value when range_query get no data or exception +DEFAULT_PREDICT_QUERY_RESULT = {"ts": [], "metric": []} +DEFAULT_MAX_RESULT = None +DEFAULT_AGG_RESULT = None +DEFAULT_ALLOCAITON_RESULT = None + +################################################################################# +# Model Settings +################################################################################# + +SEC_PER_MINUTE = 60 +SEC_PER_HOUR = 60 * SEC_PER_MINUTE +# seconds per day +SEC_PER_DAY = 24 * SEC_PER_HOUR +# first init model will learn DEFAULT_HISTORY_SPAN_SECS hiistory data. +DEFAULT_HISTORY_SPAN_SECS = 14 * SEC_PER_DAY +DEFAULT_HISTORY_MAX_UPDATE_INTERVAl = SEC_PER_HOUR +DEFAULT_FIRST_CHECK_INTERVAL = 10 * SEC_PER_MINUTE +DEFAULT_CPI2_PERIOD = 5 * SEC_PER_MINUTE +DEFAULT_CPI2_ANORMAL_THRESHOLD = 3 +# predict resource type +RESOURCE_CPU = "CPU" +RESOURCE_MEMORY = "MEMORY" +RESOURCE_IO = "IO" + +# precit recource level +TAG_LS = "LS" +TAG_BE = "BE" +TAG_NOR = "NOR" +TAG_ALL = "ALL" +TAG_OTHER = "OTHER" + +# predict category +CATEGORY_PERDICT = "predict" +CATEGORY_SLACK = "slack" +CATEGORY_REAL = "real" + +# period predict algo argument, control the nr_window +ALGO_WINDOW_MINUTES = 60 +# control the nr_slot +ALGO_SLOT_INTERVAl = 0.1 +# control the history contribute +ALGO_HALF_LIFE = 1024 + +################################################################################# +# Cluster Settings +################################################################################# +# if NO_CLUSTER_LABEL is true, we consider there is one cluster +NO_CLUSTER_LABEL = True + +################################################################################# +# Slack Settings +################################################################################# +SLACK_FACTOR = 0.8 +CPU_MAX_ALLOCATION_PRECENT = 60 +MEM_MAX_ALLOCATION_PERCENT = 85 +POD_UNLIMITED_FACTOR = 1.01 diff --git a/sysom_server/sysom_colocation/conf/common.py b/sysom_server/sysom_colocation/conf/common.py new file mode 100644 index 0000000000000000000000000000000000000000..67170150cac41bd2fcd6084286b646bd742b0fb3 --- /dev/null +++ b/sysom_server/sysom_colocation/conf/common.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File common.py +Description: +""" +from pathlib import Path +from sysom_utils import ConfigParser, SysomFramework + +BASE_DIR = Path(__file__).resolve().parent.parent + +################################################################## +# Load yaml config first +################################################################## +YAML_GLOBAL_CONFIG_PATH = f"{BASE_DIR.parent.parent}/conf/config.yml" +YAML_SERVICE_CONFIG_PATH = f"{BASE_DIR}/config.yml" + +YAML_CONFIG = ConfigParser(YAML_GLOBAL_CONFIG_PATH, YAML_SERVICE_CONFIG_PATH) + +mysql_config = YAML_CONFIG.get_server_config().db.mysql +prometheus_config = YAML_CONFIG.get_server_config().db.prometheus +service_config = YAML_CONFIG.get_service_config() + +SysomFramework.init(YAML_CONFIG) + +################################################################## +# fastapi config +################################################################## +SQLALCHEMY_DATABASE_URL = ( + f"{mysql_config.dialect}+{mysql_config.engine}://{mysql_config.user}:{mysql_config.password}@" + f"{mysql_config.host}:{mysql_config.port}/{mysql_config.database}" +) + +PROMETHEUS_DATABASE_URL = f"prometheus://{prometheus_config.host}:{prometheus_config.port}" + diff --git a/sysom_server/sysom_colocation/conf/develop.py b/sysom_server/sysom_colocation/conf/develop.py new file mode 100644 index 0000000000000000000000000000000000000000..c4de7a4cf79c929dc6d9d65e77a3cff1cf056364 --- /dev/null +++ b/sysom_server/sysom_colocation/conf/develop.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File develoop.py +Description: +""" +from .common import * + +''' +开发环境配置项 +''' + +DEBUG = True \ No newline at end of file diff --git a/sysom_server/sysom_colocation/conf/gunicorn.py b/sysom_server/sysom_colocation/conf/gunicorn.py new file mode 100644 index 0000000000000000000000000000000000000000..03eb77e1d28ef54050e788221557a21c7e1372dc --- /dev/null +++ b/sysom_server/sysom_colocation/conf/gunicorn.py @@ -0,0 +1,23 @@ +''' +Channel Service Gunicorn Settings +''' +from conf.common import YAML_CONFIG + +bind = YAML_CONFIG.get_service_config().get("bind", "127.0.0.1") +port = YAML_CONFIG.get_service_config().get("port", "80") + +workers = 2 # 指定工作进程数 + +threads = 3 + +bind = f'{bind}:{port}' + +worker_class = 'uvicorn.workers.UvicornWorker' # 工作模式线程, 默认为sync模式 + +max_requests = 2000 # 设置最大并发数量为2000 (每个worker处理请求的工作线程) + +accesslog = '/var/log/sysom/sysom-colocation-access.log' + +loglevel = 'error' + +proc_name = 'sysom_colocation_service' diff --git a/sysom_server/sysom_colocation/conf/product.py b/sysom_server/sysom_colocation/conf/product.py new file mode 100644 index 0000000000000000000000000000000000000000..981f7c8d6422da6c78e60477fa0a103f5adbbafd --- /dev/null +++ b/sysom_server/sysom_colocation/conf/product.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File product.py +Description: +""" +from .common import * + +''' +生产环境配置项 +''' + +DEBUG = False diff --git a/sysom_server/sysom_colocation/conf/settings.py b/sysom_server/sysom_colocation/conf/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..13be56ef8ccbbaf17c984c0dde7f50a15d77723f --- /dev/null +++ b/sysom_server/sysom_colocation/conf/settings.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File settings.py +Description: +""" +import os + +env = os.environ.get("env", "product") + + +if env == "develop": + from .develop import * +elif env == "testing": + from .testing import * +elif env == "product": + from .product import * + + +from .colocation_settings import * \ No newline at end of file diff --git a/sysom_server/sysom_colocation/conf/testing.py b/sysom_server/sysom_colocation/conf/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..38051d182def0e9eb4b82fa54d54755af068e6d0 --- /dev/null +++ b/sysom_server/sysom_colocation/conf/testing.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File testing.py +Description: +""" +from .common import * + +''' +测试环境配置项 +''' +DEBUG = True diff --git a/sysom_server/sysom_colocation/config.yml b/sysom_server/sysom_colocation/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..4e7e36b84426ebae086bfedc854dd6243d4f4136 --- /dev/null +++ b/sysom_server/sysom_colocation/config.yml @@ -0,0 +1,35 @@ +vars: + SERVICE_NAME: &SERVICE_NAME sysom_colocation + SERVICE_CONSUMER_GROUP: + !concat &SERVICE_CONSUMER_GROUP [*SERVICE_NAME, "_consumer_group"] + +sysom_server: + cec: + consumer_group: *SERVICE_CONSUMER_GROUP + +sysom_service: + service_name: *SERVICE_NAME + service_dir: *SERVICE_NAME + protocol: http + host: 127.0.0.1 + bind: 127.0.0.1 + port: 7022 + framework: + gcache: + protocol: redis + node_dispatch: + cmg: + tags: + - colocation + - FastApi + # Metadata of service + metadata: + check: + type: http + url: "/api/v1/colocation/health/check" + interval: 10 + timeout: 10 + deregister: 25 + header: + tls_skip_verify: false + diff --git a/sysom_server/sysom_colocation/lib/README.md b/sysom_server/sysom_colocation/lib/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ec74424ba6ecde5d035d0c113bafb0ddc6e4cfa --- /dev/null +++ b/sysom_server/sysom_colocation/lib/README.md @@ -0,0 +1 @@ +The current directory holds the public libraries or utils needed for microservices \ No newline at end of file diff --git a/sysom_server/sysom_colocation/lib/algo.py b/sysom_server/sysom_colocation/lib/algo.py new file mode 100644 index 0000000000000000000000000000000000000000..4ab5035972bf72352e3166b6dbc9852f1c70127d --- /dev/null +++ b/sysom_server/sysom_colocation/lib/algo.py @@ -0,0 +1,181 @@ +import numpy as np +import pandas as pd + +from .utils import ts_2_hour +from .algo_stat import Stat +import math +from clogger import logger + + +class Algorithm: + def __init__(self) -> None: + pass + + def learn(self, ts: float, util: float) -> None: + pass + + def predict(self, ts: float) -> float: + pass + + def report(self) -> None: + pass + + +class PeriodAlgo(Algorithm): + def __init__(self, window_minutes: int, interval: int, half_life: int) -> None: + self._window_minutes = window_minutes + self._interval = interval + self._nr_window = int((24 * 60) / window_minutes) + self._slots = int(100 / interval) + self._model = np.zeros(self._nr_window * self._slots).reshape( + self._nr_window, self._slots + ) + + self._decay_factor = math.pow(1 / 2, 1 / half_life) + # self._table = np.logspace( + # start=0, stop=half_life, endpoint=True, base=self._decay_factor + # ) + self._cur_window = -1 + self._reset() + self._train_times = 0 + self._slot_stats = [Stat() for i in range(self._nr_window)] + self._stat = Stat() + self._ts = 0 + self._history = [[] for i in range(self._nr_window)] + logger.info( + f"window_minutes={self._window_minutes} interval={self._interval} window={self._nr_window} slots={self._slots} decay_coefficient={self._decay_factor}" + ) + + def _reset(self) -> None: + self._buffer = [] + + def _update(self, window: int, util: float, ts: float) -> None: + self._buffer.append(util) + self._cur_window = window + self._ts = ts if ts > self._ts else self._ts + + def _quantile(self): + return pd.Series(self._buffer).quantile(0.95) + + def _avg(self): + return pd.Series(self._buffer).mean() + + def _max(self): + return pd.Series(self._buffer).max() + + def _record_window_avg(self) -> None: + if self._cur_window == -1: + return + util = self._quantile() + real = int(util / self._interval) + self._history[self._cur_window].append(util) + self._model[self._cur_window] *= self._decay_factor + self._model[self._cur_window][real] += 1 + self._train_times += 1 + + def _record(self, window, expect, real) -> None: + self._stat._record(expect, real) + self._slot_stats[window]._record(expect, real) + + def train(self, arr: np.ndarray) -> None: + if arr.shape[0] != self._nr_window: + print("Invalid train data.") + return + + for window, series in enumerate(arr): + for util in series: + self.learn(window, util, 1) + + def predict(self, ts: float) -> float: + window = ts_2_hour(ts) + expect = self._model.argmax(axis=1)[window] + result = expect * self._interval + logger.debug(f"model predict ts={ts} window={window} result={result}") + return result + + def learn(self, ts: float, util: float) -> None: + window = ts_2_hour(ts) + if ts < self._ts: + logger.error("occur coming data is older than model.") + return + if self._cur_window != window: + self._record_window_avg() + self._reset() + self._update(window, util, ts) + + def report(self) -> None: + for i in range(self._nr_window): + logger.debug( + f"predict window={i} util={self.predict(self._window_minutes * i * 60 + 16*60*60):.1f}" + ) + # self._stat.report() + + def predict_future(self): + future_watermark = self._model.argmax(axis=1) * self._interval + return future_watermark + + +class RtAlgo(Algorithm): + def __init__(self, interval, half_life: int) -> None: + self._interval = interval + self._decay_factor = math.pow(1 / 2, 1 / half_life) + self._model = np.zeros(int(100.0 / self._interval)) + self._stat = Stat() + + def learn(self, ts: float, util: float) -> None: + index = int(util / self._interval) + self._model *= self._decay_factor + self._model[index] += 1 + + def predict(self, ts: float) -> float: + expect = self._model.argmax() * self._interval + return expect + + def _record(self, expect, real) -> None: + self._stat._record(expect, real) + + def report(self) -> None: + self._stat.report() + + +class MixAlgo: + def __init__(self) -> None: + self.rt_algo = RtAlgo(0.1, 4) + self.period_algo = PeriodAlgo(60, 0.1, 4096) + self._stat = Stat() + + def learn(self, window: int, util: float): + self.rt_algo.learn(util) + self.period_algo.learn(window, util) + + def predict(self, window: int, util: float) -> float: + rt_result = self.rt_algo.predict() + period_result = self.period_algo.predict(window) + if util is not None: + self.rt_algo._record(rt_result, util) + self.period_algo._record(window, period_result, util) + + factor = self.period_algo.slot_stats[window].avg_error() / ( + self.rt_algo.stat.avg_error() + + self.period_algo.slot_stats[window].avg_error() + ) + factor = 0 + mix_result = factor * rt_result + (1 - factor) * period_result + + print( + "rt={:.2f} period={:.2f} mix={:.2f} real={:.2f}".format( + rt_result, period_result, mix_result, util + ) + ) + return mix_result + + def _record(self, expect, real): + self._stat._record(expect, real) + + def report(self): + print("mix:") + self._stat.report() + print("rt:") + self.rt_algo.report() + print("period:") + self.period_algo.report() diff --git a/sysom_server/sysom_colocation/lib/algo_stat.py b/sysom_server/sysom_colocation/lib/algo_stat.py new file mode 100644 index 0000000000000000000000000000000000000000..d896535777c1416e96219aef92d23667918cacea --- /dev/null +++ b/sysom_server/sysom_colocation/lib/algo_stat.py @@ -0,0 +1,31 @@ + +class Stat: + def __init__(self) -> None: + self.all = 0 + self.succ = 0 + self.error = 0 + + def record(self, expect: float, real: float) -> None: + self.all += 1 + error = abs(expect - real) + if error != 0: + self.error += error + else: + self.succ += 1 + + def report(self) -> None: + if self.all > 0: + print( + "predict={} success={} rate={:.2f} error_avg={:.2f}".format( + self.all, + self.succ, + float(self.succ) / self.all, + float(self.error) / self.all, + ) + ) + + def avg_error(self) -> float: + if self.all > 0: + return float(self.error) / self.all + else: + return 0 diff --git a/sysom_server/sysom_colocation/lib/pod_state.py b/sysom_server/sysom_colocation/lib/pod_state.py new file mode 100644 index 0000000000000000000000000000000000000000..babd4becf2aadd3d434d682c424ed40274a17bfe --- /dev/null +++ b/sysom_server/sysom_colocation/lib/pod_state.py @@ -0,0 +1,420 @@ +from abc import abstractmethod +import threading +from time import time +from typing import List, Tuple +from lib.table import ( + ContaierCfsStatistics, + ContainerCfsQuotaTable, + ContainerCpuacctStatTable, + ContainerMemutilTable, + ContainerPmuEvents, +) +from metric_reader.metric_reader import dispatch_metric_reader, MetricReader +from conf.common import PROMETHEUS_DATABASE_URL +from lib.utils import generate_unique_key, get_today_zero_ts +import conf.settings as settings +from clogger import logger + + +def get_max_resource_interval(last_update_time: float): + interval = None + if last_update_time is None: + interval = "1d" + elif time() - last_update_time >= settings.SEC_PER_HOUR: + interval = "1h" + return interval + + +class PodMetric: + def __init__( + self, cluster: str, instance: str, ns: str, pod: str, container: str + ) -> None: + self._cluster = cluster + self._instance = instance + self._ns = ns + self._pod = pod + self._con = container + self._value = 0.0 + self._last_update_time = None + + @abstractmethod + def value(self) -> float: + if self._last_update_time is not None: + return self._value + else: + return None + + +class PodMaxCPU(PodMetric): + def __init__( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + container: str, + metric_reader: MetricReader, + ) -> None: + PodMetric.__init__(self, cluster, instance, ns, pod, container) + self._cpu_table = ContainerCpuacctStatTable( + metric_reader, + cluster, + instance, + settings.TAG_OTHER, + settings.RESOURCE_CPU, + settings.TAG_OTHER, + 0, + ) + + def _update_cpu_max(self) -> None: + # get the query time interval + interval = get_max_resource_interval(self._last_update_time) + if interval is None: + return + + history_max_cpu = self._cpu_table.query_history_max( + self._ns, self._pod, self._con, self._last_update_time, interval + ) + if history_max_cpu is None: + return + + self._value = max(self._value, history_max_cpu) + self._last_update_time = time() + + def value(self) -> float: + self._update_cpu_max() + return super().value() + + +class PodMaxMem(PodMetric): + def __init__( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + container: str, + metric_reader: MetricReader, + ) -> None: + PodMetric.__init__(self, cluster, instance, ns, pod, container) + self._mem_table = ContainerMemutilTable( + metric_reader, + cluster, + instance, + settings.RESOURCE_MEMORY, + settings.TAG_OTHER, + 0, + settings.TAG_OTHER, + ) + + def _update_mem_max(self) -> None: + # get the query time interval + interval = get_max_resource_interval(self._last_update_time) + if interval is None: + return + + history_max_mem = self._mem_table.query_history_max( + self._ns, self._pod, self._con, self._last_update_time, interval + ) + if history_max_mem is None: + return + self._value = max(self._value, history_max_mem) + self._last_update_time = time() + + def value(self) -> float: + self._update_mem_max() + return super().value() + + +class PodMeanCPI(PodMetric): + def __init__( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + container: str, + metric_reader: MetricReader, + ) -> None: + PodMetric.__init__(self, cluster, instance, ns, pod, container) + self.pmu_table = ContainerPmuEvents(metric_reader, cluster, instance) + + def _update_mean_cpi(self) -> None: + end = get_today_zero_ts() + if ( + self._last_update_time is None + or (end - self._last_update_time) >= settings.SEC_PER_DAY + ): + mean_cpi = self.pmu_table.query_history_mean( + self._ns, self._pod, self._con, end, "1d" + ) + + if mean_cpi is not None: + self._value = mean_cpi + self._last_update_time = end + + def value(self) -> float: + self._update_mean_cpi() + return super().value() + + +class PodStddevCPI(PodMetric): + def __init__( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + container: str, + metric_reader: MetricReader, + ) -> None: + PodMetric.__init__(self, cluster, instance, ns, pod, container) + self.pmu_table = ContainerPmuEvents(metric_reader, cluster, instance) + + def _update_stddev_cpi(self) -> None: + end = get_today_zero_ts() + if ( + self._last_update_time is None + or (end - self._last_update_time) >= settings.SEC_PER_DAY + ): + stddev_cpi = self.pmu_table.query_history_stddev( + self._ns, self._pod, self._con, end, "1d" + ) + + if stddev_cpi is not None: + self._value = stddev_cpi + self._last_update_time = end + + def value(self) -> float: + self._update_stddev_cpi() + return super().value() + + +class PodCpuQuota(PodMetric): + def __init__( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + container: str, + metric_reader: MetricReader, + ) -> None: + PodMetric.__init__(self, cluster, instance, ns, pod, container) + self._cpu_table = ContainerCfsQuotaTable( + metric_reader, + cluster, + instance, + settings.RESOURCE_CPU, + settings.TAG_LS, + None, + ) + + def _update_quota(self) -> None: + now = time() + if ( + self._last_update_time is None + or (now - self._last_update_time) >= settings.SEC_PER_HOUR + ): + val = self._cpu_table.query_latest_quota_ratio( + self._ns, self._pod, self._con + ) + if val is not None: + self._value = val + self._last_update_time = now + + def value(self) -> float: + self._update_quota() + return super().value() + + +class PodCFS(PodMetric): + """P99 CFS satisfication of last week, update per hour""" + + def __init__( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + container: str, + metric_reader: MetricReader, + ) -> None: + PodMetric.__init__(self, cluster, instance, ns, pod, container) + self._cfs_table = ContaierCfsStatistics(metric_reader, cluster, instance) + + def _update_cfs(self) -> None: + now = time() + if ( + self._last_update_time is None + or (now - self._last_update_time) >= settings.SEC_PER_HOUR * 1 + ): + # query + val = self._cfs_table.query_p99_one_week( + self._ns, self._pod, self._con, now + ) + if val is not None: + self._value = val + self._last_update_time = now + + def value(self) -> float: + self._update_cfs() + return super().value() + + +class PodInfo: + def __init__( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + container: str, + metric_reader: MetricReader, + ) -> None: + self._cluster = cluster + self._instance = instance + self._ns = ns + self._pod = pod + self._con = container + self._max_cpu = PodMaxCPU(cluster, instance, ns, pod, container, metric_reader) + self._max_mem = PodMaxMem(cluster, instance, ns, pod, container, metric_reader) + self._stddev_cpi = PodStddevCPI( + cluster, instance, ns, pod, container, metric_reader + ) + self._mean_cpi = PodMeanCPI( + cluster, instance, ns, pod, container, metric_reader + ) + self._quota = PodCpuQuota(cluster, instance, ns, pod, container, metric_reader) + self._cfs = PodCFS(cluster, instance, ns, pod, container, metric_reader) + self.cpi_anormal_list = [] + + def query_history_max(self, resource: str) -> float: + if resource == settings.RESOURCE_CPU: + return self._max_cpu.value() + elif resource == settings.RESOURCE_MEMORY: + return self._max_mem.value() + else: + logger.error("Not supported resource type.") + + def remove_cpi_ts(self): + """remove the timeout CPI, we only count the last 5 mintes""" + now = time() + while len(self.cpi_anormal_list) != 0: + if now - self.cpi_anormal_list[0] > settings.DEFAULT_CPI2_PERIOD: + self.cpi_anormal_list.pop(0) + else: + break + + def cpi2_alarm(self, cur_cpi: float, ts: float) -> List[float]: + self.remove_cpi_ts() + cpi_stddev = self._stddev_cpi.value() + cpi_mean = self._mean_cpi.value() + if cpi_stddev is not None and cpi_mean is not None: + if cur_cpi > cpi_mean + 2 * cpi_stddev: + self.cpi_anormal_list.append(ts) + logger.info( + f"[cpi2_alarm] one abnormal{self._ns}-{self._pod}-{self._con} cur_cpi={cur_cpi} stddev={cpi_stddev} avg={cpi_mean}, pod={self._ns}-{self._pod}-{self._con}" + ) + else: + logger.error( + f"[cpi2_alarm] {self._ns}-{self._pod}-{self._con} stddev={cpi_stddev} avg={cpi_mean}, pod={self._ns}-{self._pod}-{self._con}" + ) + + if len(self.cpi_anormal_list) >= 3: + logger.info( + f"[cpi2_alarm] generate alarm. {self._ns}-{self._pod}-{self._con} cur_cpi={cur_cpi} stddev={cpi_stddev} avg={cpi_mean}, pod={self._ns}-{self._pod}-{self._con}" + ) + self.cpi_anormal_list.clear() + return cpi_mean, cpi_stddev + else: + return None + + def cfs_alarm(self, cur_cfs: float) -> float: + p99_val = self._cfs.value() + if p99_val is None: + logger.error(f"[cfs_alarm] {self._ns}-{self._pod}-{self._con} p99 is None.") + + return p99_val if cur_cfs < p99_val else None + + def sys_alarm(self, cur_sys: float, cur_user: float) -> float: + quota_val = self._quota.value() + + if quota_val is None: + logger.error( + f"[sys_alarm] {self._ns}-{self._pod}-{self._con} quota is None." + ) + return quota_val + if ( + cur_sys > 0.5 * quota_val + or ((cur_sys + cur_user) > 0.2 * quota_val and cur_sys / cur_user) > 1.5 + ): + return quota_val + else: + return None + + +class PodStateManager: + def __init__(self) -> None: + self._pod_dict = {} + self._metric_reader = dispatch_metric_reader(PROMETHEUS_DATABASE_URL) + self._lock = threading.Lock() + + def init_pod_info(self, cluster: str, instance: str, ns: str, pod: str, con: str): + key = generate_unique_key(cluster, instance, ns, pod, con) + if key not in self._pod_dict.keys(): + self._pod_dict[key] = PodInfo( + cluster, instance, ns, pod, con, self._metric_reader + ) + return key + + def query_max( + self, cluster: str, instance: str, ns: str, pod: str, con: str, resource: str + ) -> float: + with self._lock: + key = self.init_pod_info(cluster, instance, ns, pod, con) + return self._pod_dict[key].query_history_max(resource) + + def cpi2_alarm( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + con: str, + cpi: float, + ts: float, + ) -> Tuple[float, float]: + with self._lock: + key = self.init_pod_info(cluster, instance, ns, pod, con) + return self._pod_dict[key].cpi2_alarm(cpi, ts) + + def sys_alarm( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + con: str, + sys: float, + user: float, + ) -> float: + with self._lock: + key = self.init_pod_info(cluster, instance, ns, pod, con) + return self._pod_dict[key].sys_alarm(sys, user) + + def cfs_alarm( + self, + cluster: str, + instance: str, + ns: str, + pod: str, + con: str, + cfs: float, + ) -> float: + with self._lock: + key = self.init_pod_info(cluster, instance, ns, pod, con) + return self._pod_dict[key].cfs_alarm(cfs) + + +pod_mgr = PodStateManager() diff --git a/sysom_server/sysom_colocation/lib/table.py b/sysom_server/sysom_colocation/lib/table.py new file mode 100644 index 0000000000000000000000000000000000000000..89011d7d25cbdf0c762a90799c41d393a2df788f --- /dev/null +++ b/sysom_server/sysom_colocation/lib/table.py @@ -0,0 +1,742 @@ +from abc import abstractmethod +import traceback +from pandas.core.api import DataFrame as DataFrame +from metric_reader.metric_reader import MetricReader +from metric_reader.result import MetricResult +from metric_reader.task import InstantQueryTask, RangeQueryTask +import conf.settings as settings +from clogger import logger +import pandas as pd +from typing import List + + +def aggravate_pod(result: MetricResult): + time_series_list = [] + for item in result.data: + df = pd.DataFrame( + { + "ts": pd.to_datetime([int(val[0]) for val in item.values], unit="s"), + "value": [float(val[1]) for val in item.values], + } + ) + df.set_index("ts", inplace=True) + df = df.resample("15S").ffill() + time_series_list.append(df) + df_resample = pd.concat(time_series_list, axis=1).reset_index() + df_resample["metric"] = df_resample.drop("ts", axis=1).sum(axis=1) + df_resample["ts"] = df_resample["ts"].apply(lambda x: x.timestamp()) + return df_resample[["ts", "metric"]] + + +def parse_aggregate_result(result: MetricResult) -> float: + try: + if result.code != 0: + raise Exception(f"result: code={result.code} msg=({result.err_msg})") + if len(result.data) == 0: + raise Exception(f"Query no data.") + + return float(result.data[0].to_dict()["value"][1]) + except Exception as e: + logger.error(e) + return settings.DEFAULT_AGG_RESULT + + +class Table: + def __init__(self, reader: MetricReader, cluster: str, instance: str) -> None: + self._reader = reader + self._cluster = cluster + self._instance = instance + self._table_name = None + + +class ResourceTable(Table): + def __init__( + self, reader: MetricReader, cluster: str, instance: str, resource: str, tag: str + ) -> None: + Table.__init__(self, reader, cluster, instance) + self._resource = resource + self._tag = tag + + +class PredictTable(ResourceTable): + """The table that get the history data for model training. + + Args: + Table (class): The class contain base attributes. + """ + + def __init__( + self, + reader: MetricReader, + cluster: str, + instance: str, + resource: str, + tag: str, + max: float, + ) -> None: + """Init the PredictTable. + + Args: + reader (MetricReader): Prometheus reader + cluster (str): cluster id + instance (str): instance id + resource (str): resource type. reading conf/colocation_settings.py + tag (str): aggravate level. reading conf/colocation_settings.py + max (float): the max val for this . eg. for cpu is nr_cpu * 100, for memory is nr_byte. + """ + ResourceTable.__init__(self, reader, cluster, instance, resource, tag) + self._max = max + + @abstractmethod + def range_query(self, start_time: float, end_time: float) -> pd.DataFrame: + """query the range metric, process it for model training + + Args: + start_time (float): range start time + end_time (float): range end time + + Returns: + pd.DataFrame: must include colmuns=['ts', 'metric] and all 'metric' must in [0, 100] + """ + pass + + +class AllocationTable(ResourceTable): + def __init__( + self, + reader: MetricReader, + cluster: str, + instance: str, + resource: str, + tag: str, + max: float, + ) -> None: + ResourceTable.__init__(self, reader, cluster, instance, resource, tag) + self._max = max + + @abstractmethod + def query_allocation( + self, + ) -> float: + """get the resource allocation percent + + Returns: + float: must in [0, 100] + """ + pass + + +class MaxTable(ResourceTable): + def __init__( + self, reader: MetricReader, cluster: str, instance: str, resource: str, tag: str + ) -> None: + ResourceTable.__init__(self, reader, cluster, instance, resource, tag) + + @abstractmethod + def query_max( + self, + ) -> float: + """get the resource max quota + + Returns: + float: follow the resource type, the value will be used in normalized. + """ + + +class ProcCpuTotalTable(PredictTable): + def __init__( + self, + reader: MetricReader, + cluster: str, + instance: str, + resource: str, + tag: str, + max: float, + ) -> None: + PredictTable.__init__(self, reader, cluster, instance, resource, tag, max) + self._table_name = settings.TABLE_PROC_CPU_TOTAL + + def range_query(self, start_time: float, end_time: float) -> pd.DataFrame: + df = pd.DataFrame() + task = RangeQueryTask( + self._table_name, start_time=start_time, end_time=end_time + ).append_equal_filter("instance", self._instance) + + result = self._reader.range_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: instance={self._instance} range={end_time-start_time:.1f} result: code=({result.code}) msg=({result.err_msg})" + ) + + if len(result.data) == 0: + logger.warning(f"Query {self._table_name} no data.") + return pd.DataFrame(settings.DEFAULT_PREDICT_QUERY_RESULT) + + df["ts"] = [float(val[0]) for val in result.data[0].values] + for item in list(set(result.data)): + if item.labels["mode"] in set(settings.PORC_CPU_USGAE): + df[item.labels["mode"]] = [ + float(val[1]) / self._max * 100 for val in item.values + ] + + df = df.assign(metric=df[settings.PORC_CPU_USGAE].sum(axis=1)) + return df[["ts", "metric"]] + except Exception as e: + traceback.print_exc() + logger.error(e) + return pd.DataFrame(settings.DEFAULT_PREDICT_QUERY_RESULT) + + +class ContainerCpuacctStatTable(PredictTable): + def __init__( + self, + reader: MetricReader, + cluster: str, + instance: str, + level: str, + resource: str, + tag: str, + max: float, + ) -> None: + PredictTable.__init__(self, reader, cluster, instance, resource, tag, max) + self._level = level + self._table_name = settings.TABLE_CONTAINER_CPUACCT_STAT + self._value = "total" + + def range_query(self, start_time: float, end_time: float) -> pd.DataFrame: + df = pd.DataFrame() + task = ( + RangeQueryTask(self._table_name, start_time=start_time, end_time=end_time) + .append_equal_filter("instance", self._instance) + .append_equal_filter("bvt", self._level) + .append_equal_filter("value", self._value) + .append_equal_filter("container", "None") + ) + result = self._reader.range_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: bvt={self._level} instance={self._instance} value={self._value} range={end_time-start_time:.1f} result: code=({result.code}) msg=({result.err_msg})" + ) + + if len(result.data) == 0: + logger.warning(f"Query {self._table_name} no data.") + return pd.DataFrame(settings.DEFAULT_PREDICT_QUERY_RESULT) + + df = aggravate_pod(result) + df["metric"] = df["metric"] / self._max * 100 + return df[["ts", "metric"]] + except Exception as e: + traceback.print_exc() + logger.error(e) + return pd.DataFrame(settings.DEFAULT_PREDICT_QUERY_RESULT) + + def query_history_max( + self, ns: str, pod: str, con: str, end: float, interval: str + ) -> float: + task = ( + InstantQueryTask(self._table_name, end, "max_over_time", interval) + .append_equal_filter("instance", self._instance) + .append_equal_filter("namespace", ns) + .append_equal_filter("pod", pod) + .append_equal_filter("container", con) + .append_equal_filter("value", self._value) + ) + result = self._reader.instant_query([task]) + data = parse_aggregate_result(result) + if data is None: + logger.error( + f"No avail data: Query table={self._table_name} agg=max_over_time interval={interval} agg_val={None} ins={self._instance} ns={ns} pod={pod} con={con} value={self._value}" + ) + return data + + def query_cpu_time(self, start, end) -> List[dict]: + # query all pod + task = ( + RangeQueryTask(self._table_name, start, end) + .append_equal_filter("instance", self._instance) + .append_equal_filter("bvt", settings.TAG_LS) + .append_equal_filter("container", "None") + .append_wildcard_filter("value", "user|system") + ) + result = self._reader.range_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. result: code=({result.code}) msg=({result.err_msg})" + ) + pod_info = {} + for item in result.data: + labels = item["labels"] + values = item["values"] + key = f"{labels['namespace']}-{labels['pod']}-{labels['container']}" + if key not in pod_info.keys(): + pod_info[key] = { + "ns": labels["namespace"], + "pod": labels["pod"], + "con": labels["container"], + } + pod_info[key][labels["value"]] = values + return [val for val in pod_info.values()] + except Exception as e: + logger.error(e) + return [] + + +class ContainerCfsQuotaTable(AllocationTable): + def __init__( + self, + reader: MetricReader, + cluster: str, + instance: str, + resource: str, + tag: str, + max: float, + ) -> None: + AllocationTable.__init__(self, reader, cluster, instance, resource, tag, max) + self._table_name = settings.TABLE_CONTAINER_CFS_QUOTA + self._level = settings.TAG_LS + self._value = "quota_ratio" + self._max = max + + def query_latest_quota_ratio(self, namespace, pod, con) -> float: + task = ( + InstantQueryTask(self._table_name, None) + .append_equal_filter("bvt", self._level) + .append_equal_filter("value", self._value) + .append_equal_filter("instance", self._instance) + .append_equal_filter("namespace", namespace) + .append_equal_filter("pod", pod) + .append_equal_filter("container", con) + ) + result = self._reader.instant_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: bvt={self._level} instance={self._instance} value={self._value} result: code={result.code} msg=({result.err_msg})" + ) + if len(result.data) == 0: + raise Exception(f"Query {self._table_name} no data") + return float(result.data[0].value[1]) + + except Exception as e: + traceback.print_exc() + logger.error(e) + return settings.DEFAULT_ALLOCAITON_RESULT + + def query_allocation(self) -> float: + """calculate the LS CPU allocation quota + + Returns: + float: sum of all LS Pod CPU quota_ratio, and normalize the value to the range of [0, 60]. + """ + # TODO use the pod level quota ratio, handle one pod with multi containers + sum_val = 0 + task = ( + InstantQueryTask(self._table_name, None) + .append_equal_filter("bvt", self._level) + .append_equal_filter("value", self._value) + .append_equal_filter("instance", self._instance) + .append_equal_filter("container", "None") + ) + result = self._reader.instant_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: bvt={self._level} instance={self._instance} value={self._value} result: code={result.code} msg=({result.err_msg})" + ) + + if len(result.data) == 0: + raise Exception(f"Query {self._table_name} no data") + + for item in result.data: + pod_limit = float(item.to_dict()["value"][1]) + if pod_limit >= self._max: + from lib.pod_state import pod_mgr + + labels = item.to_dict()["labels"] + pod_limit = pod_mgr.query_max( + self._cluster, + self._instance, + labels["namespace"], + labels["pod"], + labels["container"], + settings.RESOURCE_CPU, + ) + if pod_limit is None: + logger.error( + f"get the history max cpu replace the unlimited cpu error. ins={self._instance} ns={labels['namespace']} pod={labels['pod']} con={labels['container']} pod_limit={pod_limit} self._max={self._max}" + ) + pod_limit = 0 + sum_val += pod_limit * settings.POD_UNLIMITED_FACTOR + + logger.warning( + f"cpu allocation_rate({sum_val / self._max * 100}) = sum_val({sum_val}) / max({self._max}) * 100" + ) + """ not allowed return val more than 60 """ + return min(settings.CPU_MAX_ALLOCATION_PRECENT, sum_val / self._max * 100) + except Exception as e: + traceback.print_exc() + logger.error(e) + return settings.DEFAULT_ALLOCAITON_RESULT + + +class ProcCpusTable(MaxTable): + def __init__( + self, reader: MetricReader, cluster: str, instance: str, resource: str, tag: str + ) -> None: + MaxTable.__init__(self, reader, cluster, instance, resource, tag) + self._table_name = settings.TABLE_PROC_CPUS + self._mode = "total" + self._total = None + + def query_max(self) -> float: + if self._total is not None: + return self._total + + task = ( + InstantQueryTask(self._table_name, None, "count") + .append_equal_filter("mode", self._mode) + .append_equal_filter("instance", self._instance) + ) + result = self._reader.instant_query([task]) + nr_cpu = parse_aggregate_result(result) + if nr_cpu is not None: + self._total = nr_cpu * 100 + else: + logger.error( + f"No avail data: Query table={self._table_name} agg=count interval=None agg_val={None} ins={self._instance} ns={None} pod={None} con={None} value={None}" + ) + + return self._total + + +class ProcMeminfoTable(MaxTable, PredictTable): + def __init__( + self, + reader: MetricReader, + cluster: str, + instance: str, + resource: str, + tag: str, + max: float, + ) -> None: + MaxTable.__init__(self, reader, cluster, instance, resource, tag) + PredictTable.__init__(self, reader, cluster, instance, resource, tag, max) + self._table_name = settings.TABLE_PROC_MEMINFO + self._max_value = "MemTotal" + self._avail_value = "MemAvailable" + self._total = None + + def range_query(self, start_time: float, end_time: float) -> DataFrame: + df = pd.DataFrame() + task = ( + RangeQueryTask(self._table_name, start_time=start_time, end_time=end_time) + .append_equal_filter("instance", self._instance) + .append_equal_filter("value", self._avail_value) + ) + result = self._reader.range_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: instance={self._instance} value={self._avail_value} range={end_time-start_time:.1f} result: code=({result.code}) msg=({result.err_msg})" + ) + if len(result.data) != 1: + raise Exception( + f"Query {self._table_name} data num is incorrect. size=({len(result.data)}) args: instance={self._instance} value={self._avail_value} range={end_time-start_time:.1f}" + ) + + df["ts"] = [float(val[0]) for val in result.data[0].values] + """ calculate the usage_rate = (total - available) / total * 100""" + df["metric"] = [ + (self._max - float(val[1]) * 1024) / self._max * 100 + for val in result.data[0].values + ] + return df[["ts", "metric"]] + except Exception as e: + traceback.print_exc() + logger.error(e) + return pd.DataFrame(settings.DEFAULT_PREDICT_QUERY_RESULT) + + def query_max(self) -> float: + if self._total is not None: + logger.debug(f"{self._table_name} total_max={self._total}") + return self._total + task = ( + InstantQueryTask(self._table_name, None) + .append_equal_filter("instance", self._instance) + .append_equal_filter("value", self._max_value) + ) + result = self._reader.instant_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: mode={self._mode} result: code=({result.code}) msg=({result.err_msg})" + ) + if len(result.data) == 0: + raise Exception(f"Query {self._table_name} no data.") + + """ calculate the total memory in bytes """ + self._total = float(result.data[0].to_dict()["value"][1]) * 1024 + return self._total + except Exception as e: + traceback.print_exc() + logger.error(e) + return settings.DEFAULT_MAX_RESULT + + +class ContainerMemutilTable(PredictTable, AllocationTable): + def __init__( + self, + reader: MetricReader, + cluster: str, + instance: str, + resource: str, + tag: str, + max: float, + level: str, + ) -> None: + PredictTable.__init__(self, reader, cluster, instance, resource, tag, max) + AllocationTable.__init__(self, reader, cluster, instance, resource, tag, max) + self._table_name = settings.TABLE_CONTAINER_MEMUTIL + self._usage_value = "usage" + self._limit_value = "limit" + self._level = level + + def range_query(self, start_time: float, end_time: float) -> DataFrame: + task = ( + RangeQueryTask(self._table_name, start_time=start_time, end_time=end_time) + .append_equal_filter("instance", self._instance) + .append_equal_filter("value", self._usage_value) + .append_equal_filter("bvt", self._level) + .append_equal_filter("container", "None") + ) + result = self._reader.range_query([task]) + + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: instance={self._instance} value={self._usage_value} range={end_time-start_time:.1f} result: code=({result.code}) msg=({result.err_msg})" + ) + + if len(result.data) == 0: + logger.warning(f"Query {self._table_name} no data.") + return pd.DataFrame(settings.DEFAULT_PREDICT_QUERY_RESULT) + + df = aggravate_pod(result) + df["metric"] = df["metric"] / self._max * 100 + return df[["ts", "metric"]] + except Exception as e: + traceback.print_exc() + logger.error(e) + return pd.DataFrame(settings.DEFAULT_PREDICT_QUERY_RESULT) + + def query_history_max( + self, ns: str, pod: str, con: str, end: float, interval: str + ) -> float: + task = ( + InstantQueryTask(self._table_name, end, "max_over_time", interval) + .append_equal_filter("instance", self._instance) + .append_equal_filter("value", self._limit_value) + .append_equal_filter("namespace", ns) + .append_equal_filter("pod", pod) + .append_equal_filter("container", con) + ) + result = self._reader.instant_query([task]) + data = parse_aggregate_result(result) + if data is None: + logger.error( + f"No avail data: Query table={self._table_name} agg=max_over_time interval={interval} agg_val={None} ins={self._instance} ns={ns} pod={pod} con={con} value={self._limit_value}" + ) + return data + + def query_allocation(self) -> float: + sum_val = 0 + task = ( + InstantQueryTask(self._table_name, None) + .append_equal_filter("bvt", self._level) + .append_equal_filter("value", self._limit_value) + .append_equal_filter("instance", self._instance) + .append_equal_filter("container", "None") + ) + result = self._reader.instant_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: bvt={self._level} instance={self._instance} value={self._limit_value} container=None result: code={result.code} msg=({result.err_msg})" + ) + if len(result.data) == 0: + raise Exception( + f"Query {self._table_name} no data. args: bvt={self._level} instance={self._instance} value={self._limit_value} container=None " + ) + + for item in result.data: + pod_limit = float(item.to_dict()["value"][1]) + labels = item.to_dict()["labels"] + if pod_limit >= self._max: + from lib.pod_state import pod_mgr + + pod_limit = pod_mgr.query_max( + self._cluster, + self._instance, + labels["namespace"], + labels["pod"], + labels["container"], + settings.RESOURCE_MEMORY, + ) + sum_val += pod_limit * settings.POD_UNLIMITED_FACTOR + """ not allowed return val more than 100 """ + return min(settings.MEM_MAX_ALLOCATION_PERCENT, sum_val / self._max * 100) + except Exception as e: + traceback.print_exc() + logger.error(e) + return settings.DEFAULT_ALLOCAITON_RESULT + + +class ContainerPmuEvents(Table): + def __init__(self, reader: MetricReader, cluster: str, instance: str) -> None: + Table.__init__(self, reader, cluster, instance) + self._table_name = settings.TABLE_CONTANINER_PMU_EVENTS + self._value = "CPI" + + def query_history_stddev( + self, ns: str, pod: str, con: str, end: float, interval: str + ) -> float: + task = ( + InstantQueryTask(self._table_name, end, "stddev_over_time", interval) + .append_equal_filter("instance", self._instance) + .append_equal_filter("namespace", ns) + .append_equal_filter("pod", pod) + .append_equal_filter("container", con) + .append_equal_filter("value", self._value) + ) + result = self._reader.instant_query([task]) + data = parse_aggregate_result(result) + if data is None: + logger.error( + f"No avail data: Query table={self._table_name} agg=stddev_over_time interval={interval} agg_val={None} ins={self._instance} ns={ns} pod={pod} con={con} value={self._value}" + ) + return data + + def query_history_mean( + self, ns: str, pod: str, con: str, end: float, interval: str + ) -> float: + task = ( + InstantQueryTask(self._table_name, end, "avg_over_time", interval) + .append_equal_filter("instance", self._instance) + .append_equal_filter("namespace", ns) + .append_equal_filter("pod", pod) + .append_equal_filter("container", con) + .append_equal_filter("value", self._value) + ) + result = self._reader.instant_query([task]) + data = parse_aggregate_result(result) + if data is None: + logger.error( + f"No avail data: Query table={self._table_name} agg=avg_over_time interval={interval} agg_val={None} ins={self._instance} ns={ns} pod={pod} con={con} value={self._value}" + ) + return data + + def query_lastest_cpi(self, start_time, end_time) -> List[dict]: + container_list = [] + task = ( + RangeQueryTask(self._table_name, start_time=start_time, end_time=end_time) + .append_equal_filter("instance", self._instance) + .append_equal_filter("bvt", settings.TAG_LS) + .append_equal_filter("value", self._value) + ) + + result = self._reader.range_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: instance={self._instance} result: code={result.code} msg=({result.err_msg})" + ) + for item in result.data: + labels = item.to_dict()["labels"] + if labels["container"] != "None": + container_list.append( + { + "ns": labels["namespace"], + "pod": labels["pod"], + "con": labels["container"], + "data": item.values, + } + ) + return container_list + except Exception as e: + logger.error(e) + return [] + + +class ContaierCfsStatistics(Table): + def __init__(self, reader: MetricReader, cluster: str, instance: str) -> None: + Table.__init__(self, reader, cluster, instance) + self._table_name = settings.TABLE_CONTANINER_CFS_STATISTICS + + def query_serve_util_rate(self, start_time, end_time) -> List[dict]: + container_info = {} + task = ( + RangeQueryTask(self._table_name, start_time=start_time, end_time=end_time) + .append_equal_filter("instance", self._instance) + .append_equal_filter("bvt", settings.TAG_LS) + .append_equal_filter("value", "serveutil") + ) + + result = self._reader.range_query([task]) + try: + if result.code != 0: + raise Exception( + f"Query {self._table_name} Failed. args: instance={self._instance} result: code={result.code} msg=({result.err_msg})" + ) + for item in result.data: + labels = item.to_dict()["labels"] + key = f"{labels['namespace']}-{labels['pod']}-{labels['container']}" + if labels["container"] != "None": + container_info[key] = { + "ns": labels["namespace"], + "pod": labels["pod"], + "con": labels["container"], + "serveutil": [val for val in item.values], + } + return [v for v in container_info.values()] + except Exception as e: + logger.error(e) + return [] + + def query_p99_one_week(self, ns: str, pod: str, con: str, end: float) -> float: + """get the p99 serve util, because the serveutil bigger the status better, + so we get the p01 as the threshold + + Args: + ns (str): _description_ + pod (str): _description_ + con (str): _description_ + end (float): _description_ + + Returns: + float: _description_ + """ + task = ( + InstantQueryTask( + metric_name=self._table_name, + time=end, + aggregation="quantile_over_time", + interval="7d", + aggregation_val="0.01", + ) + .append_equal_filter("instance", self._instance) + .append_equal_filter("namespace", ns) + .append_equal_filter("pod", pod) + .append_equal_filter("container", con) + .append_equal_filter("value", "serveutil") + ) + + result = self._reader.instant_query([task]) + data = parse_aggregate_result(result) + if data is None: + logger.error( + f"No avail data: Query table={self._table_name} agg=quantile_over_time interval=7d agg_val=0.01 ins={self._instance} ns={ns} pod={pod} con={con} value=serveutil" + ) + return data diff --git a/sysom_server/sysom_colocation/lib/utils.py b/sysom_server/sysom_colocation/lib/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5559ed2638056c3769e92bfefd68e1b04a29fa23 --- /dev/null +++ b/sysom_server/sysom_colocation/lib/utils.py @@ -0,0 +1,83 @@ +import time + +import pytz +import conf.settings as settings +from typing import List +from metric_reader.metric_reader import MetricReader, RangeQueryTask +from clogger import logger +from datetime import datetime, date + + +def get_today_zero_ts() -> float: + today = date.today() + today_tuple = today.timetuple() + today_zero_ts = time.mktime(today_tuple) + return today_zero_ts + + +def ts_2_hour(ts: float) -> int: + return datetime.fromtimestamp(ts, pytz.timezone("Asia/Shanghai")).hour + + +def collect_all_clusters(metric_reader: MetricReader) -> List[str]: + cluster_list = [] + res = metric_reader.get_label_values("cluster") + if len(res.data) <= 0: + logger.warning("Collect all cluster failed!") + return cluster_list + return [item for item in res.data] + + +def collect_instances_of_cluster( + cluster_id: str, metric_reader: MetricReader, interval: int +) -> List[str]: + """Collect all instances of specific cluster + + Use "sysom_proc_cpu_total" metric to collect all pods + of specific instance, need to make sure the metric has been correctlly + exported (similar to grafana variables). + + Args: + instance_id: instance id + metric_reader: MetricReader instance of metric_reader sdk + interval: time interval of query + + Returns: + List of instances + """ + + instances_list = [] + + task = RangeQueryTask( + settings.TABLE_PROC_CPU_TOTAL, + start_time=time.time() - interval, + end_time=time.time(), + ).append_equal_filter("mode", "total") + + if cluster_id != "default": + task.append_equal_filter("cluster", cluster_id) + + node_metric_res = metric_reader.range_query([task]) + if len(node_metric_res.data) <= 0: + logger.error(f"Collect instances of {cluster_id} info: no instances found!") + return instances_list + + try: + for i in range(len(node_metric_res.data)): + labels = node_metric_res.data[i].to_dict()["labels"] + if settings.NODE_LABEL in labels: + instances_list.append(labels[settings.NODE_LABEL]) + except Exception as e: + raise e + + return list(set(instances_list)) + + +def generate_unique_key(*args, **kwargs) -> str: + val_list = [] + for arg in args: + val_list.append(arg) + + for value in kwargs.values(): + val_list.append(str(value)) + return "-".join(val_list) diff --git a/sysom_server/sysom_colocation/main.py b/sysom_server/sysom_colocation/main.py new file mode 100644 index 0000000000000000000000000000000000000000..0e55026ade35430c6e50c68447aa36ffcd38f656 --- /dev/null +++ b/sysom_server/sysom_colocation/main.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- # +""" +Time 2023/12/20 19:47 +Author: weizhen (ZouTao) +Email wodemia@linux.alibaba.com +File main.py +Description: +""" +from clogger import logger +from fastapi import FastAPI, HTTPException, Response +from app.routers import health +from prometheus_client import Gauge, CollectorRegistry, generate_latest +from conf.settings import YAML_CONFIG +import conf.settings as settings +from app.worker.period_predict_worker import PeriodPredictWorker +from app.worker.cpi_worker import CPIWorker +from app.worker.serve_util_worker import ServeUtilWorker +from app.worker.sys_worker import SysWorker +from sysom_utils import CmgPlugin, SysomFramework +import json + + +app = FastAPI() +g_cache_rt = SysomFramework.gcache(settings.CACHE_RT) +g_cache_future = SysomFramework.gcache(settings.CACHE_FUTURE) +CLUSTER_COLOCATION_LABEL_RT = ["instance", "resource", "tag", "category"] +CLUSTER_COLOCATION_LABEL_FUTURE = ["instance", "resource", "tag", "category", "future"] + +registry = CollectorRegistry() + +colocation_node_predict_rt = Gauge( + "sysom_colocation_node_predict_rt", + "sysom_colocation_node_predict_rt", + CLUSTER_COLOCATION_LABEL_RT, + registry=registry, +) + +colocation_node_predict_future = Gauge( + "sysom_colocation_node_predict_future", + "sysom_colocation_node_predict_future", + CLUSTER_COLOCATION_LABEL_FUTURE, + registry=registry, +) + + +app.include_router(health.router, prefix="/api/v1/colocation/health") +# app.include_router(health.router, prefix="/api/v1/colocation/person") + + +############################################################################# +# Write your API interface here, or add to app/routes +############################################################################# + + +def init_framwork(): + SysomFramework.init(YAML_CONFIG).load_plugin_cls(CmgPlugin).start() + logger.info("SysomFramework init finished!") + + +@app.on_event("startup") +async def on_start(): + init_framwork() + g_cache_rt.clean() + g_cache_future.clean() + PeriodPredictWorker().start() + logger.info(f"混部资源水位预测服务已启动.") + CPIWorker().start() + logger.info(f"混部干扰检测-CPI干扰检测服务已启动.") + SysWorker().start() + logger.info(f"混部干扰检测-Sys干扰检测服务已启动.") + ServeUtilWorker().start() + logger.info(f"混部干扰检测-CFS满足率干扰检测服务已启动.") + + ############################################################################# + # Perform some microservice initialization operations over here + ############################################################################# + + +@app.on_event("shutdown") +async def on_shutdown(): + pass + + +@app.get("/metrics") +def prometheus_get_metrics(): + def process_cache(cache: SysomFramework.gcache, gauge: Gauge, labels: list): + metrics_all = cache.load_all() + + if len(metrics_all) <= 0: + return + + for _, results in metrics_all.items(): + logger.debug(f"values={results}") + metrics = json.loads(results) + gauge.labels(*[metrics[label] for label in labels]).set(metrics["value"]) + + try: + process_cache( + g_cache_rt, colocation_node_predict_rt, CLUSTER_COLOCATION_LABEL_RT + ) + process_cache( + g_cache_future, + colocation_node_predict_future, + CLUSTER_COLOCATION_LABEL_FUTURE, + ) + except Exception as e: + logger.error(e) + finally: + return Response(generate_latest(registry), media_type="text/plain") diff --git a/sysom_server/sysom_diagnosis/apps/task/executor.py b/sysom_server/sysom_diagnosis/apps/task/executor.py index f06338d889e1cb94cec9b51a4d762bde52e452dd..99c44ae0b633f2934be4a6190999a1dca52db2da 100644 --- a/sysom_server/sysom_diagnosis/apps/task/executor.py +++ b/sysom_server/sysom_diagnosis/apps/task/executor.py @@ -1,29 +1,51 @@ +import time +import base64 from clogger import logger +from typing import Optional +from schedule import Scheduler from apps.task.models import JobModel from django.conf import settings from cec_base.event import Event from cec_base.consumer import Consumer +from cec_base.cec_client import StoppableThread from sysom_utils import AsyncEventExecutor, CecAsyncConsumeTask, ConfigParser from asgiref.sync import sync_to_async from datetime import datetime, timedelta from django.db.models import Q +from lib.utils import uuid_8 from .helper import DiagnosisHelper, DiagnosisTaskResult, DiagnosisJobResult +from service_scripts.base import DiagnosisJob class DiagnosisTaskExecutor(AsyncEventExecutor): def __init__(self, config: ConfigParser): super().__init__(settings.SYSOM_CEC_PRODUCER_URL, callback=self.process_event) + self._check_task_schedule: Scheduler = Scheduler() + self._check_task_process_thread: Optional[StoppableThread] = None + self._check_interval: int = settings.CHECK_INTERVAL + self._task_execute_timeout: int = settings.TASK_EXECUTE_TIMEOUT self.append_group_consume_task( settings.SYSOM_CEC_DIAGNOSIS_TASK_DISPATCH_TOPIC, settings.SYSOM_CEC_DIAGNOSIS_CONSUMER_GROUP, Consumer.generate_consumer_id(), ensure_topic_exist=True, ) + self.append_group_consume_task( + settings.SYSOM_CEC_OFFLINE_ORIGIN_DIAGNOSIS_RESULT_TOPIC, + settings.SYSOM_CEC_DIAGNOSIS_CONSUMER_GROUP, + Consumer.generate_consumer_id(), + ensure_topic_exist=True, + ) async def process_event(self, event: Event, task: CecAsyncConsumeTask): try: if task.topic_name == settings.SYSOM_CEC_DIAGNOSIS_TASK_DISPATCH_TOPIC: await self._process_task_dispatch_event(event) + elif ( + task.topic_name + == settings.SYSOM_CEC_OFFLINE_ORIGIN_DIAGNOSIS_RESULT_TOPIC + ): + await self._process_offline_origin_diagnosis_result_event(event) else: # Unexpected logger.error("Receive unknown topic event, unexpected!!") @@ -49,6 +71,107 @@ class DiagnosisTaskExecutor(AsyncEventExecutor): except Exception as exc: logger.exception(f"Diagnosis process task dispatch event error: {str(exc)}") + async def _process_offline_origin_diagnosis_result_event(self, event: Event): + """Process offline full diagnosis result event + { + # 诊断类型 + "service_name": "xxx", + + # 诊断ID,如果有的话 + "task_id": "", + + # 诊断参数,如果有的话 + "params": { + "instance": "xxx" + }, + + # 诊断命令 + "command": { + "in_order": false, + "jobs": [ + { + "instance": "172.17.0.1", + "cmd": "sysak ossre_client -s > /dev/null && cat /var/log/sysak/ossre.log" + } + ] + } + + # 结果内容编码方式(同离线诊断回传接口) + "content_encoding": "xxx", + + # 结果内容(一个job对应一个结果,如果前处理脚本只返回一个job,则回传结果列表里面只包含一个字符串代表结果即可,同离线诊断回传接口) + "results": [], + + "created_by": "xxx" + } + """ + try: + assert isinstance(event.value, dict) + service_name = event.value.get("service_name", "") + task_id = event.value.get("task_id", uuid_8()) + params = event.value.get("params", {}) + if "channel" not in params: + params["channel"] = "offline" + content_encoding = event.value.get("content_encoding", "text") + results = event.value.get("results", []) + command = event.value.get( + "command", + { + "in_order": False, + "jobs": [ + {"instance": params.get("instance", ""), "cmd": ""} + for _ in results + ], + }, + ) + created_by = event.value.get("created_by", "cec") + + if content_encoding == "base64": + results = [base64.b64decode(result).decode() for result in results] + + # 1. Get task + instance = None + try: + instance = await sync_to_async(JobModel.objects.get)(task_id=task_id) + if instance.status not in ["Ready", "Running"]: + return + except JobModel.DoesNotExist: + pass + if instance is None: + task_params = { + "task_id": task_id, + "command": command, + "created_by": created_by, + "params": params, + "service_name": service_name, + "status": "Running", + } + instance = await sync_to_async(JobModel.objects.create)(**task_params) + + # 2. Build diagnosis task result + commands = instance.command.get("jobs", []) + job_result = DiagnosisTaskResult( + 0, + job_results=[ + DiagnosisJobResult( + 0, + stdout=result, + job=DiagnosisJob.from_dict(commands[idx]), + file_list=[], + ) + for idx, result in enumerate(results) + ], + in_order=False, + ) + + # 3. Invoke postprocess script + await DiagnosisHelper.postprocess_async(instance, job_result) + + except Exception as exc: + logger.exception( + f"Diagnosis process offline full diagnosis result event error: {str(exc)}" + ) + ################################################################################################ # 诊断任务执行 ################################################################################################ @@ -59,36 +182,51 @@ class DiagnosisTaskExecutor(AsyncEventExecutor): async def _execute_diagnosis_task_by_model(self, instance: JobModel): # 1. Preprocess - res = await DiagnosisHelper.preprocess_async(instance) + channel = instance.params.get("channel", "") + ignore_channels = settings.IGNORE_TOOL_CHECK_CHANNELS + res = await DiagnosisHelper.preprocess_async( + instance, channel in ignore_channels + ) + + if not res: + raise Exception("Diagnosis preprocess error, DiagnosisTask is None") + + # 1.1 Preprocess post wrapper + is_offline = await DiagnosisHelper.preprocess_post_wrapper_async(instance, res) + await DiagnosisHelper._update_job_async(instance, status="Running") + if is_offline: + return # 2. Execute and Postprocess - if res: - if not res.offline_mode: - job_result = await DiagnosisHelper.execute_async(instance, res) - else: - job_result = DiagnosisTaskResult( - 0, - job_results=[ - DiagnosisJobResult( - 0, - stdout=item, - job=res.jobs[idx] if len(res.jobs) > idx else None, - ) - for idx, item in enumerate(res.offline_results) - ], - in_order=res.in_order, - ) - await DiagnosisHelper.postprocess_async(instance, job_result) + if not res.offline_mode: + job_result = await DiagnosisHelper.execute_async(instance, res) + else: + job_result = DiagnosisTaskResult( + 0, + job_results=[ + DiagnosisJobResult( + 0, + stdout=item, + job=res.jobs[idx] if len(res.jobs) > idx else None, + ) + for idx, item in enumerate(res.offline_results) + ], + in_order=res.in_order, + ) + await DiagnosisHelper.postprocess_async(instance, job_result) # 3. TODO: produce task execute result to cec - def start(self): - super().start() - + ################################################################################################ + # 轮询检查任务是否超时 + ################################################################################################ + def _check_task_timeout(self): # Check and mark timeout tasks - expire_minutes_ago = datetime.now() - timedelta(minutes=10) + expire_minutes_ago = datetime.now() - timedelta( + minutes=self._task_execute_timeout + ) instances = JobModel.objects.filter( - Q(created_at__lte=expire_minutes_ago) & Q(status="Running") + Q(created_at__lte=expire_minutes_ago) & (Q(status__in=["Running", "Ready"])) ) for instance in instances: instance.code = 1 @@ -96,3 +234,31 @@ class DiagnosisTaskExecutor(AsyncEventExecutor): instance.result = "Diagnosis execute task timeout" instance.err_msg = "Diagnosis execute task timeout" instance.save() + + def _check_task_thead(self): + """check channel job thead schedule""" + self._check_task_schedule.every(self._check_interval).seconds.do( + self._check_task_timeout + ) + + while True: + if ( + not self._check_task_process_thread.stopped() + and self._check_task_process_thread.is_alive() + ): + self._check_task_schedule.run_pending() + time.sleep(self._check_interval * 0.8) + + def start(self): + super().start() + + if ( + self._check_task_process_thread is not None + and not self._check_task_process_thread.stopped() + and self._check_task_process_thread.is_alive() + ): + return + + self._check_task_process_thread = StoppableThread(target=self._check_task_thead) + self._check_task_process_thread.setDaemon(True) + self._check_task_process_thread.start() diff --git a/sysom_server/sysom_diagnosis/apps/task/filter.py b/sysom_server/sysom_diagnosis/apps/task/filter.py index 8376c3f0e8c1bedc17141b4f52a304fbb84fca2c..65f04c869cabe784fd7202563bb5a08d98020366 100644 --- a/sysom_server/sysom_diagnosis/apps/task/filter.py +++ b/sysom_server/sysom_diagnosis/apps/task/filter.py @@ -1,6 +1,8 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- +import json import django_filters +from typing import List from clogger import logger from rest_framework.filters import BaseFilterBackend from apps.task.models import JobModel @@ -10,10 +12,16 @@ class TaskFilter(django_filters.FilterSet): service_name = django_filters.CharFilter(field_name="service_name") # https://stackoverflow.com/questions/58977818/how-to-use-django-filter-on-jsonfield channel = django_filters.CharFilter(field_name="params__channel") + params = django_filters.CharFilter(method='params_filter') + + def params_filter(self, queryset, name, value): + values: List[dict] = json.loads(value) + params = {item['key']: item['value'] for item in values if item["value"]} + return queryset.filter(params__contains=params) if value else queryset class Meta: model = JobModel - fields = ["id", "task_id", "created_by__id", "status"] + fields = ["id", "task_id", "created_by__id", "status", "params"] class IsOwnerFilterBackend(BaseFilterBackend): diff --git a/sysom_server/sysom_diagnosis/apps/task/helper.py b/sysom_server/sysom_diagnosis/apps/task/helper.py index b96f0df57b53727affd61d83ff3b51a58d55af5c..fa80ee33c3559553214303f7b9298d69fb70a842 100644 --- a/sysom_server/sysom_diagnosis/apps/task/helper.py +++ b/sysom_server/sysom_diagnosis/apps/task/helper.py @@ -10,6 +10,7 @@ from apps.task.models import JobModel from importlib import import_module from django.conf import settings from lib.utils import uuid_8 +from lib.exception import DiagnosisErrorCode, DiagnosisException from asgiref.sync import sync_to_async, async_to_sync from channel_job.job import default_channel_job_executor, JobResult from service_scripts.base import ( @@ -20,7 +21,10 @@ from service_scripts.base import ( DiagnosisPreProcessor, PostProcessResult, DiagnosisPostProcessor, + DiagnosisHookProcessor, + HookProcessResult, ) +from service_scripts.wrapper.base import DiagnosisPreProcessorPostWrapperBase class DiagnosisHelper: @@ -69,6 +73,8 @@ class DiagnosisHelper: user_id = user["id"] task_id = uuid_8() service_name = data.get("service_name", None) + if "channel" not in data: + data["channel"] = "offline" task_params = { "command": "", "task_id": task_id, @@ -93,7 +99,9 @@ class DiagnosisHelper: } @staticmethod - def preprocess(instance: JobModel, ignore_check: bool = False) -> Optional[DiagnosisTask]: + def preprocess( + instance: JobModel, ignore_check: bool = False + ) -> Optional[DiagnosisTask]: """ "Perform diagnosis preprocessing { "commands":[ @@ -159,7 +167,9 @@ class DiagnosisHelper: return DiagnosisHelper.run_subprocess(cmd) @staticmethod - async def preprocess_v1_async(instance: JobModel, ignore_check: bool = False) -> DiagnosisTask: + async def preprocess_v1_async( + instance: JobModel, ignore_check: bool = False + ) -> DiagnosisTask: """ "Perform diagnosis preprocessing { @@ -185,6 +195,11 @@ class DiagnosisHelper: # 2. Invoke preprocessing script(preprocessing script) SCRIPTS_DIR = settings.SCRIPTS_DIR service_path = os.path.join(SCRIPTS_DIR, service_name) + + # 防止任意命令执行 + if os.path.dirname(service_path) != SCRIPTS_DIR: + raise Exception(f"Invalid pre-processing script: {service_path}") + if not os.path.exists(service_path): raise Exception("Can not find script file, please check service name") try: @@ -219,7 +234,9 @@ class DiagnosisHelper: return diagnosis_task @staticmethod - async def preprocess_v2_async(instance: JobModel, ignore_check: bool = False) -> Optional[DiagnosisTask]: + async def preprocess_v2_async( + instance: JobModel, ignore_check: bool = False + ) -> Optional[DiagnosisTask]: """Pre-processing V2 Args: @@ -287,7 +304,9 @@ class DiagnosisHelper: return diagnosis_task @staticmethod - async def preprocess_async(instance: JobModel, ignore_check: bool = False) -> Optional[DiagnosisTask]: + async def preprocess_async( + instance: JobModel, ignore_check: bool = False + ) -> Optional[DiagnosisTask]: """ "Perform diagnosis preprocessing { "commands":[ @@ -303,26 +322,83 @@ class DiagnosisHelper: """ diagnosis_task: Optional[DiagnosisTask] = None try: - diagnosis_task = await DiagnosisHelper.preprocess_v2_async(instance, ignore_check) + diagnosis_task = await DiagnosisHelper.preprocess_v2_async( + instance, ignore_check + ) if diagnosis_task is None: - diagnosis_task = await DiagnosisHelper.preprocess_v1_async(instance, ignore_check) + diagnosis_task = await DiagnosisHelper.preprocess_v1_async( + instance, ignore_check + ) # If the pre-processor executes successfully, the parameters are compliant # and the Job instance is updated await DiagnosisHelper._update_job_async( - instance, command=diagnosis_task.to_dict(), status="Running" + instance, command=diagnosis_task.to_dict() ) except Exception as exc: - logger.exception(f"Diagnosis preprocess error: {str(exc)}") + logger.exception( + f"Diagnosis preprocess error: {instance.task_id} -> {str(exc)}" + ) await DiagnosisHelper._update_job_async( instance, result="Diagnosis preprocess error", status="Fail", - code=1, + code=DiagnosisErrorCode.PREPROCESS_ERROR, err_msg=f"Diagnosis preprocess error: {str(exc)}", ) return diagnosis_task + @staticmethod + async def preprocess_post_wrapper_async( + instance: JobModel, diagnosis_task: DiagnosisTask + ) -> bool: + """Preprocess post wrapper + + Args: + diagnosis_task (DiagnosisTask): Diagnosis task + wrapper (Type[DiagnosisPreProcessor]): Diagnosis preprocessor post wrapper + """ + + def _get_pre_processor_post_wrapper( + wrapper_type: str, + ) -> Type[DiagnosisPreProcessorPostWrapperBase]: + try: + return import_module( + f"service_scripts.wrapper.{wrapper_type}" + ).DiagnosisPreProcessorPostWrapper + except Exception as e: + raise Exception(f"No Pre-processor-post-wrapper available => {str(e)}") + + try: + # 1. Get params + params = instance.params.copy() + is_offline = params.get("channel", "") == "offline" + if isinstance(instance.params, str): + try: + params = json.loads(instance.params) + except Exception as exc: + raise Exception(f"Task params loads error: {str(exc)}") + if "sysom_preprocess_post_wrapper" not in params: + return is_offline + + preprocess_post_wrapper = params.pop("sysom_preprocess_post_wrapper") + wrapper = _get_pre_processor_post_wrapper(preprocess_post_wrapper)() + wrapper.process(instance.task_id, diagnosis_task) + await DiagnosisHelper._update_job_async( + instance, command=diagnosis_task.to_dict() + ) + except Exception as exc: + logger.exception(f"Diagnosis preprocess post wrapper error: {str(exc)}") + await DiagnosisHelper._update_job_async( + instance, + result="Diagnosis preprocess post wrapper error", + status="Fail", + code=DiagnosisErrorCode.PREPROCESS_POST_WRAPPER_ERROR, + err_msg=f"Diagnosis preprocess post wrapper error: {str(exc)}", + ) + + return is_offline + @staticmethod async def execute_async( instance: JobModel, diagnosis_task: DiagnosisTask @@ -413,7 +489,7 @@ class DiagnosisHelper: instance, result="Diagnosis execute task error", status="Fail", - code=1, + code=DiagnosisErrorCode.EXEC_ERROR, err_msg=diagnosis_task_result.err_msg, ) return diagnosis_task_result @@ -427,6 +503,11 @@ class DiagnosisHelper: SCRIPTS_DIR = settings.SCRIPTS_DIR service_post_name = service_name + "_post" service_post_path = os.path.join(SCRIPTS_DIR, service_post_name) + + # 防止任意命令执行 + if os.path.dirname(service_post_path) != SCRIPTS_DIR: + raise Exception(f"Invalid post-processing script: {service_post_path}") + if not os.path.exists(service_post_path): raise Exception( f"No matching post-processing script found: {service_post_path}" @@ -436,7 +517,9 @@ class DiagnosisHelper: with tempfile.NamedTemporaryFile(mode="w") as tmp_file: try: # 将要传递的中间结果写入到临时文件当中 - tmp_file.write("".join([item.stdout for item in diagnosis_task_result.job_results])) + tmp_file.write( + "".join([item.stdout for item in diagnosis_task_result.job_results]) + ) tmp_file.flush() resp = await DiagnosisHelper.run_subprocess_async( [service_post_path, tmp_file.name, instance.task_id] @@ -534,7 +617,7 @@ class DiagnosisHelper: await DiagnosisHelper._update_job_async( instance, status="Fail", - code=code, + code=DiagnosisErrorCode.EXEC_ERROR, result=diagnosis_task_result.job_results[0].stdout, err_msg=err_msg, ) @@ -554,7 +637,10 @@ class DiagnosisHelper: if post_process_result.code != 0: # 后处理脚本认为诊断出错 await DiagnosisHelper._update_job_async( - instance, err_msg=post_process_result.err_msg, status="Fail" + instance, + err_msg=post_process_result.err_msg, + status="Fail", + code=DiagnosisErrorCode.POSTPROCESS_ERROR, ) else: # 后处理脚本执行成功,更新任务状态 @@ -567,6 +653,45 @@ class DiagnosisHelper: instance, result="Diagnosis postprocess error", status="Fail", - code=1, + code=DiagnosisErrorCode.POSTPROCESS_ERROR, err_msg=f"Diagnosis postprocess error: {str(exc)}", ) + + @staticmethod + def invoke_diagnosis_hook( + instance: JobModel, hook_params: dict + ) -> HookProcessResult: + return async_to_sync(DiagnosisHelper.invoke_diagnosis_hook_async)( + instance, hook_params + ) + + @staticmethod + async def invoke_diagnosis_hook_async( + instance: JobModel, hook_params: dict + ) -> HookProcessResult: + """Invoke task hook""" + + def _get_diagnosis_hook(service_name: str) -> Type[DiagnosisHookProcessor]: + try: + return import_module( + f"service_scripts.{service_name}_hook" + ).HookProcessor + except Exception as e: + raise Exception(f"No Diagnosis-Hook-Processor available => {str(e)}") + + res = HookProcessResult(code=1, err_msg="Invoke diagnosis hook error", data={}) + try: + # 1. Get params + params = instance.params.copy() + if isinstance(instance.params, str): + try: + params = json.loads(instance.params) + except Exception as exc: + raise Exception(f"Task params loads error: {str(exc)}") + service_name = params.get("service_name", "") + + hooker = _get_diagnosis_hook(service_name)(service_name) + res = await hooker.invoke_hook(instance, hook_params) + except Exception as exc: + logger.exception(f"Diagnosis hook invoke error: {str(exc)}") + return res diff --git a/sysom_server/sysom_diagnosis/apps/task/migrations/0004_auto_20240102_1741.py b/sysom_server/sysom_diagnosis/apps/task/migrations/0004_auto_20240102_1741.py new file mode 100644 index 0000000000000000000000000000000000000000..98ed24f3a12e4424b873a299df25be88a57cc74f --- /dev/null +++ b/sysom_server/sysom_diagnosis/apps/task/migrations/0004_auto_20240102_1741.py @@ -0,0 +1,23 @@ +# Generated by Django 3.2.16 on 2024-01-02 09:45 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('task', '0003_auto_20230926_1509'), + ] + + operations = [ + migrations.AlterField( + model_name='jobmodel', + name='command', + field=models.JSONField(verbose_name='shell文本'), + ), + migrations.AlterField( + model_name='jobmodel', + name='created_by', + field=models.TextField(verbose_name='创建人'), + ), + ] diff --git a/sysom_server/sysom_diagnosis/apps/task/models.py b/sysom_server/sysom_diagnosis/apps/task/models.py index 393f4b2b3a36e31bd2816d58b8945363ae06b1f8..1d5ef5a6c52f28668d0c3cc45adecb1fdc3309b3 100644 --- a/sysom_server/sysom_diagnosis/apps/task/models.py +++ b/sysom_server/sysom_diagnosis/apps/task/models.py @@ -18,7 +18,7 @@ class JobModel(BaseModel): err_msg = models.TextField(verbose_name="诊断错误信息", default="") result = models.TextField(verbose_name="shell结果") params = models.JSONField(verbose_name="params") - created_by = models.IntegerField(verbose_name='创建人') + created_by = models.TextField(verbose_name='创建人') def __str__(self): return f"Job: {self.task_id}" diff --git a/sysom_server/sysom_diagnosis/apps/task/urls.py b/sysom_server/sysom_diagnosis/apps/task/urls.py index 1c59eeb93f6f49f2a1f76206f16e0c3ca4d50833..d11514072d5318d1464845e3875fe81e1a6382c4 100644 --- a/sysom_server/sysom_diagnosis/apps/task/urls.py +++ b/sysom_server/sysom_diagnosis/apps/task/urls.py @@ -14,6 +14,7 @@ router = DefaultRouter() router.register('tasks', views.TaskAPIView) urlpatterns = [ + path('api/v1/tasks/task_hook/', views.TaskAPIView.as_view({'post': 'task_hook'})), path('api/v1/tasks/sbs_task_create/', views.TaskAPIView.as_view({'post': 'sbs_task_create'})), path('api/v1/tasks/sbs_task_result/', views.TaskAPIView.as_view({'post': 'sbs_task_result'})), path('api/v1/tasks/offline_import/', views.TaskAPIView.as_view({'post': 'offline_import'})), diff --git a/sysom_server/sysom_diagnosis/apps/task/views.py b/sysom_server/sysom_diagnosis/apps/task/views.py index a16fb2905ebb6e5b4492e3612632e0610420672e..24a41429b1a7343afe54abae15af602b1a8e2009 100644 --- a/sysom_server/sysom_diagnosis/apps/task/views.py +++ b/sysom_server/sysom_diagnosis/apps/task/views.py @@ -1,3 +1,5 @@ +import enum +import base64 import requests import tempfile import os @@ -17,11 +19,17 @@ from service_scripts.base import FileItem from asgiref.sync import async_to_sync from .helper import DiagnosisHelper from service_scripts.base import ( + DiagnosisJob, DiagnosisJobResult, DiagnosisTaskResult, ) +class ContextType(enum.Enum): + TEXT = 'text' + BASE64 = 'base64' + + class TaskAPIView( CommonModelViewSet, mixins.ListModelMixin, @@ -31,7 +39,12 @@ class TaskAPIView( ): queryset = JobModel.objects.all().order_by("-created_at") serializer_class = seriaizer.JobListSerializer - filter_backends = (IsOwnerFilterBackend, DjangoFilterBackend, SearchFilter, OrderingFilter) + filter_backends = ( + IsOwnerFilterBackend, + DjangoFilterBackend, + SearchFilter, + OrderingFilter, + ) search_fields = ("id", "task_id", "created_by__id", "status", "params") # 模糊查询 filterset_class = TaskFilter # 精确查询 authentication_classes = [TokenAuthentication] @@ -95,9 +108,16 @@ class TaskAPIView( if not res["success"]: return ErrorResponse(msg=res.get("message", "Missing parameters")) data = request.data + diagnosis_params = data.pop("params", {}) + params = { + **data, + **diagnosis_params + } + if "channel" not in params: + params["channel"] = settings.DEFAULT_CHANNEL # 3. Create Task - instance = DiagnosisHelper.init(data, getattr(request, "user")) + instance = DiagnosisHelper.init(params, getattr(request, "user")) self.produce_event_to_cec( settings.SYSOM_CEC_DIAGNOSIS_TASK_DISPATCH_TOPIC, {"task_id": instance.task_id}, @@ -125,6 +145,8 @@ class TaskAPIView( "service_name": data["service_name"], **data["params"], } + if "channel" not in params: + params["channel"] = "offline" # 2. Create Task instance = DiagnosisHelper.init(params, getattr(request, "user")) @@ -132,7 +154,9 @@ class TaskAPIView( # 3. Invoke preprocess script diagnosis_task = DiagnosisHelper.preprocess(instance, True) response = seriaizer.JobRetrieveSerializer(instance) - self.produce_event_to_cec(settings.SYSOM_CEC_DIAGNOSIS_TASK_CREATED, response.data) + self.produce_event_to_cec( + settings.SYSOM_CEC_DIAGNOSIS_TASK_CREATED, response.data + ) if diagnosis_task is None: return ErrorResponse( msg=f"Preprocess script invoke error: {instance.err_msg}" @@ -152,12 +176,23 @@ class TaskAPIView( obj_list = request.FILES.getlist("files") task_id = request.POST.get("task_id", None) brief = request.POST.get("brief", False) + content_encoding = request.POST.get("content_encoding", "text") results = request.POST.getlist("results", None) + try: + content_encoding = ContextType(content_encoding) + except ValueError: + return ErrorResponse("content_encoding field can only `text` or `base64`!") + if task_id is None or results is None: return ErrorResponse( f"Missing params, required both and " ) + if content_encoding.value == 'base64': + results = [ + base64.b64decode(result).decode() for result in results + ] + # 1. Get task instance = JobModel.objects.get(task_id=task_id) if instance is None: @@ -181,13 +216,16 @@ class TaskAPIView( file_list.append(file_item) # 3. Build diagnosis task result + commands = instance.command.get("jobs", []) job_result = DiagnosisTaskResult( 0, job_results=[ DiagnosisJobResult( - 0, stdout=result, job=None, file_list=file_list + 0, stdout=result, + job=DiagnosisJob.from_dict(commands[idx]), + file_list=file_list ) - for result in results + for idx, result in enumerate(results) ], in_order=False, ) @@ -202,6 +240,30 @@ class TaskAPIView( logger.exception(e) return ErrorResponse(msg=str(e)) + def task_hook(self, request, *args, **kwargs): + """Invoke task hook + + Args: + request (_type_): _description_ + """ + try: + # 1. Check required params + res = self.require_param_validate(request, ["task_id", "params"]) + if not res["success"]: + return ErrorResponse(msg=res.get("message", "Missing parameters")) + data = request.data + task_id = data.get("task_id", None) + params = data.get("params", {}) + instance = JobModel.objects.get(task_id=task_id) + res = DiagnosisHelper.invoke_diagnosis_hook(instance, params) + if res.code == 200: + return success(res.data) + else: + return ErrorResponse(msg=res.err_msg) + except Exception as e: + logger.exception(e) + return ErrorResponse(msg=str(e)) + def offline_import(self, request, *args, **kwargs): """Offline import of diagnosis logs""" try: diff --git a/sysom_server/sysom_diagnosis/conf/common.py b/sysom_server/sysom_diagnosis/conf/common.py index bd63e3538f90756d03bc0803836c1c5568b145ce..69d4ffb9c313b2faed5698a7fb1357724467c044 100644 --- a/sysom_server/sysom_diagnosis/conf/common.py +++ b/sysom_server/sysom_diagnosis/conf/common.py @@ -21,6 +21,9 @@ SysomFramework.init(YAML_CONFIG) SCRIPTS_DIR = os.path.join(BASE_DIR, 'service_scripts') +DEFAULT_CHANNEL = YAML_CONFIG.get_service_config().get("default_channel", "ssh") +IGNORE_TOOL_CHECK_CHANNELS = YAML_CONFIG.get_service_config().get("ignore_tool_check_channels", ["offline"]) + ################################################################## # Cec settings ################################################################## @@ -29,6 +32,9 @@ SYSOM_CEC_PRODUCER_URL = YAML_CONFIG.get_cec_url(CecTarget.PRODUCER) SYSOM_CEC_DIAGNOSIS_CONSUMER_GROUP = "SYSOM_CEC_DIAGNOSIS_CONSUMER_GROUP" # 诊断任务下发主题(由 View -> Executor) SYSOM_CEC_DIAGNOSIS_TASK_DISPATCH_TOPIC = "SYSOM_CEC_DIAGNOSIS_TASK_DISPATCH_TOPIC" +# 离线诊断回传主题(回传诊断命令执行的原始结果) +SYSOM_CEC_OFFLINE_ORIGIN_DIAGNOSIS_RESULT_TOPIC = "SYSOM_CEC_OFFLINE_ORIGIN_DIAGNOSIS_RESULT_TOPIC" + # 诊断任务创建成功(已执行前处理脚本) SYSOM_CEC_DIAGNOSIS_TASK_CREATED = "SYSOM_CEC_DIAGNOSIS_TASK_CREATED" @@ -116,3 +122,19 @@ REST_FRAMEWORK = { 'UNICODE_JSON': True, 'EXCEPTION_HANDLER': 'lib.exception.exception_handler' } + +########################################################################################## +# Check task interval thread config +########################################################################################## +CHECK_INTERVAL = YAML_CONFIG.get_service_config().checkinterval # check status running task interval 60 seconds +TASK_EXECUTE_TIMEOUT = YAML_CONFIG.get_service_config().taskexecutetimeout # default status running task exectue 1 minutes timeout + + +########################################################################################## +# Prometheus config +########################################################################################## +prometheus_config = YAML_CONFIG.get_server_config().db.prometheus + +PROMETHEUS_DATABASE_URL = ( + f"prometheus://{prometheus_config.host}:{prometheus_config.port}" +) diff --git a/sysom_server/sysom_diagnosis/config.yml b/sysom_server/sysom_diagnosis/config.yml index 3186900c9439ac7440d7d4b9e4fed73aa711a7a5..9ffefa5ec6d8f5ebbc0cdff8e70fe7c747dce72e 100644 --- a/sysom_server/sysom_diagnosis/config.yml +++ b/sysom_server/sysom_diagnosis/config.yml @@ -1,6 +1,6 @@ vars: - SYSAK_DOWNLOAD_URL: &SYSAK_DOWNLOAD_URL https://mirrors.openanolis.cn/sysak/packages/release-v2.2.0/ - SYSAK_VERSION: &SYSAK_VERSION 2.2.0-1 + SYSAK_DOWNLOAD_URL: &SYSAK_DOWNLOAD_URL https://mirrors.openanolis.cn/sysak/packages/release-v2.4.0/ + SYSAK_VERSION: &SYSAK_VERSION 2.4.0-1 SERVICE_NAME: &SERVICE_NAME sysom_diagnosis SERVICE_CONSUMER_GROUP: !concat &SERVICE_CONSUMER_GROUP [*SERVICE_NAME, "_consumer_group"] @@ -19,6 +19,11 @@ sysom_service: host: 127.0.0.1 bind: 127.0.0.1 port: 7002 + checkinterval: 60 + taskexecutetimeout: 10 + default_channel: ssh + ignore_tool_check_channels: + - offline framework: gcache: protocol: redis diff --git a/sysom_server/sysom_diagnosis/generate_cmds.py b/sysom_server/sysom_diagnosis/generate_cmds.py new file mode 100755 index 0000000000000000000000000000000000000000..e9ff37873dcb96971bf11abae6d7785acbe8d0c3 --- /dev/null +++ b/sysom_server/sysom_diagnosis/generate_cmds.py @@ -0,0 +1,200 @@ +import subprocess +import os +import json +import ast +import logging +import re +from typing import List, Optional, Type +from importlib import import_module +from service_scripts.base import DiagnosisTask, DiagnosisJob, DiagnosisPreProcessor + +logger = logging.getLogger("generate_cmds") + +def run_subprocess(cmd: List[str]) -> dict: + resp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return { + "stdout": resp.stdout.decode("utf-8"), + "stderr": resp.stderr.decode("utf-8"), + "returncode": resp.returncode, + } + +def preprocess_v1_async(service_name: str, params: dict) -> DiagnosisTask: + """ "Perform diagnosis preprocessing + + { + "commands":[ + { + "instance":"xxx", + "cmd":"xxx", + "params":{ => overide initial param + "region":"target_region" + } + } + ] + } + """ + # 2. Invoke preprocessing script(preprocessing script) + SCRIPTS_DIR = "./service_scripts" + service_path = os.path.join(SCRIPTS_DIR, service_name) + if not os.path.exists(service_path): + raise Exception("Can not find script file, please check service name") + try: + resp = run_subprocess([service_path, json.dumps(params)]) + except Exception as exc: + raise Exception(f"Execute preprocess script error: {str(exc)}") from exc + + # 3. If the preprocessing script executes with an error + if resp["returncode"] != 0: + raise (Exception(f"Execute preprocess script error: {resp['stderr']}")) + + # 4. If the preprocessing script executes successfully, + # take out the processing result + stdout = resp["stdout"] + resp = ast.literal_eval(stdout) + resp_scripts = resp.get("commands") + + # 5. If the preprocessing result not contains 'commands', it's a not expect bug + if not resp_scripts: + raise ( + Exception( + f"Not find commands, please check the preprocess script return" + ) + ) + diagnosis_task = DiagnosisTask( + jobs=[DiagnosisJob.from_dict(item) for item in resp_scripts], in_order=True + ) + + return diagnosis_task + +def preprocess_v2_async(service_name: str, params: dict) -> Optional[DiagnosisTask]: + """Pre-processing V2 + + Args: + instance (JobModel): JobModel + + Returns: + Optional[DiagnosisTask]: Diagnosis task + """ + + def _get_pre_processor(service_name: str) -> Type[DiagnosisPreProcessor]: + """ + 根据要执行的命令,动态引入一个 PreProcessor 的实现用于执行前处理 + """ + try: + return import_module(f"service_scripts.{service_name}_pre").PreProcessor + except Exception as e: + raise Exception(f"No Pre-processor available => {str(e)}") + + # 2. Use PreProcessor to check if the version of the tool meets the requirements + try: + params.pop("service_name", "") + pre_processor = _get_pre_processor(service_name)(service_name, **params) + except Exception as e: + return None + + # 3. Use PreProcessor to convert params to diagnosis jobs + diagnosis_task = pre_processor.get_diagnosis_cmds(params) + if diagnosis_task is None or len(diagnosis_task.jobs) == 0: + raise Exception(f"Pre-processor not return any diagnosis job") + + return diagnosis_task + +def preprocess_async(service_name: str, params: dict) -> Optional[DiagnosisTask]: + """ "Perform diagnosis preprocessing + { + "commands":[ + { + "instance":"xxx", + "cmd":"xxx", + "params":{ => overide initial param + "region":"target_region" + } + } + ] + } + """ + diagnosis_task: Optional[DiagnosisTask] = None + try: + diagnosis_task = preprocess_v2_async(service_name, params) + if diagnosis_task is None: + diagnosis_task = preprocess_v1_async(service_name, params) + except Exception as exc: + logger.exception(f"Diagnosis preprocess error: {str(exc)}, {service_name}, {params}") + return diagnosis_task + + +def get_trim_match_obj_value(match_obj, idx: int): + value = match_obj.group(idx) + if value is None: + return "" + else: + return value.strip() + +if __name__ == "__main__": + diagnosis_cmds = { + "filecache": {"instance": "127.0.0.1", "value": 1, "type": "all"}, + "iofsstat": {"instance": "127.0.0.1"}, + "iohang": {"instance": "127.0.0.1"}, + "iolatency": {"instance": "127.0.0.1"}, + "iosdiag_latency": {"instance": "127.0.0.1"}, + "jitter": {"instance": "127.0.0.1", "time": 1}, + "loadtask": {"instance": "127.0.0.1"}, + "memgraph": {"instance": "127.0.0.1"}, + "oomcheck": {"instance": "127.0.0.1"}, + "ossre": {"instance": "127.0.0.1"}, + "packetdrop": {"instance": "127.0.0.1", "time": 1}, + "pingtrace": {"origin_instance": "127.0.0.1", "pkg_num": 1, "time_gap": 1, "target_instance": "192.168.0.22"}, + "retran": {"instance": "127.0.0.1", "time": 1}, + "schedmoni": {"instance": "127.0.0.1"}, + "taskprofile": {"instance": "127.0.0.1", "timeout": 1} + } + for service_name, params in diagnosis_cmds.items(): + diagnosis_task = preprocess_async(service_name, params) + if diagnosis_task is not None: + for idx, job in enumerate(diagnosis_task.jobs): + match_obj = re.match(r'(.*)sysak (-g )*(memgraph|podmem|iofsstat|iosdiag|rtrace|loadtask|oomcheck|ossre_client|pingtrace|schedmoni)(.*?)(>>|&&|>|&)(.*)', job.cmd, re.I) + # 1 => sysak 之前的命令 + # 2 => -g or None + # 3 => sysak subcommand (memgraph|podmem|iofsstat|iosdiag|rtrace|loadtask|oomcheck|ossre_client|pingtrace|schedmoni) + # 4 => sysak subcommand 参数 + # 5 => >> | && | > | & + # 6 => sysak 之后的命令 + if match_obj: + sysak_pre = get_trim_match_obj_value(match_obj, 1) + sysak_g = get_trim_match_obj_value(match_obj, 2) + sysak_sub_cmd = get_trim_match_obj_value(match_obj, 3) + sysak_params = get_trim_match_obj_value(match_obj, 4) + sysak_sep = get_trim_match_obj_value(match_obj, 5) + sysak_post = get_trim_match_obj_value(match_obj, 6) + + + pre = sysak_pre + sysak_cmd = f"sysak{'' if not sysak_g else ' ' + sysak_g} {sysak_sub_cmd} {sysak_params}" + post = f"{sysak_sep} {sysak_post}" + + sub_func = "" + if service_name == "pingtrace": + sub_func = \ +f""" +sub_{service_name}_{"client" if "-c" in sysak_params else "server"}() {"{"} + {pre} sysak $@ {post} +{"}"} +""" + else: + sub_func = \ +f""" +sub_{service_name}() {"{"} + {pre} sysak $@ {post} +{"}"} + """ + print(sub_func) + # print(' '.join([ + # pre, + # sysak_cmd, + # post + # ])) + else: + raise Exception(f"{service_name} genrate failed") + + +# (.*)sysak (-g )*(memgraph|podmem|iofsstat|iosdiag|rtrace|loadtask|oomcheck|ossre_client|pingtrace|schedmoni)(.*?)(>|>>|&|&&)(.*) \ No newline at end of file diff --git a/sysom_server/sysom_diagnosis/lib/authentications.py b/sysom_server/sysom_diagnosis/lib/authentications.py index c1b49d2b31c9b293a0b63a8de6e34187b91c66c1..aca98c2b6538752c06cbe79b817fe0b32d0536ed 100644 --- a/sysom_server/sysom_diagnosis/lib/authentications.py +++ b/sysom_server/sysom_diagnosis/lib/authentications.py @@ -53,9 +53,7 @@ class TokenAuthentication(BaseAuthentication): raise exc # 判断用户是否已经手动注销登录 if SysomFramework.gcache("JWT_TOKEN").load(token) is None: - if is_local: - return {"id": 1, "token": "local"}, _ - else: + if not is_local: raise AuthenticationFailed('用户已退出登录!') payload['token'] = token diff --git a/sysom_server/sysom_diagnosis/lib/exception.py b/sysom_server/sysom_diagnosis/lib/exception.py index 16e4740e9eaa0d937ee291cc7a647ef2f2098f8c..f28524004f01cfc61bdc890a108e44ce3ca565e4 100644 --- a/sysom_server/sysom_diagnosis/lib/exception.py +++ b/sysom_server/sysom_diagnosis/lib/exception.py @@ -3,13 +3,32 @@ from clogger import logger from django.db.models import ProtectedError from rest_framework.views import set_rollback from rest_framework import exceptions -from rest_framework.exceptions import APIException as DRFAPIException, AuthenticationFailed, NotAuthenticated +from rest_framework.exceptions import ( + APIException as DRFAPIException, + AuthenticationFailed, + NotAuthenticated, +) from .response import ErrorResponse +class DiagnosisErrorCode: + PREPROCESS_ERROR = 1010 + PROPROCESS_CHECK_TOOL_VERSION_ERROR = 1011 + PREPROCESS_POST_WRAPPER_ERROR = 1020 + EXEC_ERROR = 1030 + POSTPROCESS_ERROR = 1040 + + +class DiagnosisException(Exception): + def __init__(self, code: int, message: str) -> None: + super().__init__(message) + self.code = code + self.message = message + + class APIException(Exception): - def __init__(self, code=400, message='API异常', args=('API异常',)): + def __init__(self, code=400, message="API异常", args=("API异常",)): self.code = code self.message = message self.args = args @@ -19,17 +38,17 @@ class APIException(Exception): class FileNotFoundException(Exception): - def __init__(self, code=404, message='文件不存在'): + def __init__(self, code=404, message="文件不存在"): self.code = code self.message = message def __str__(self): return self.message - + def exception_handler(exc, context): """自定义异常处理""" - msg = '' + msg = "" code = 400 if isinstance(exc, FileNotFoundException): diff --git a/sysom_server/sysom_diagnosis/scripts/node_init.sh b/sysom_server/sysom_diagnosis/scripts/node_init.sh index ad4ca82f990bc28092ed59eb819c5cb32f8b18c5..f9a5678de27dd3bd9427639bddd4837e6ba64b17 100755 --- a/sysom_server/sysom_diagnosis/scripts/node_init.sh +++ b/sysom_server/sysom_diagnosis/scripts/node_init.sh @@ -2,7 +2,7 @@ RESOURCE_DIR=${NODE_HOME}/${SERVICE_NAME} if [ "$SYSAK_VERTION" == "" ]; then - export SYSAK_VERTION=2.2.0-1 + export SYSAK_VERTION=2.4.0-1 fi if [ "$ARCH" == "" ]; then export ARCH=x86_64 diff --git a/sysom_server/sysom_diagnosis/scripts/node_update.sh b/sysom_server/sysom_diagnosis/scripts/node_update.sh index 397a1a0be8eca53c49d95b2a0fe6a04d5e7e2858..d6967c2ee18e2cf4adb17c4d1fe6da09bae119b9 100755 --- a/sysom_server/sysom_diagnosis/scripts/node_update.sh +++ b/sysom_server/sysom_diagnosis/scripts/node_update.sh @@ -2,7 +2,7 @@ RESOURCE_DIR=${NODE_HOME}/${SERVICE_NAME} if [ "$SYSAK_VERTION" == "" ]; then - export SYSAK_VERTION=2.2.0-1 + export SYSAK_VERTION=2.4.0-1 fi if [ "$ARCH" == "" ]; then export ARCH=x86_64 diff --git a/sysom_server/sysom_diagnosis/service_scripts/__init__.py b/sysom_server/sysom_diagnosis/service_scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_server/sysom_diagnosis/service_scripts/base.py b/sysom_server/sysom_diagnosis/service_scripts/base.py index 9bd5b197b4a5fa4cb4a5dc159f1701f6e887c129..19fb9a07b7a2702fa44420b903537ff03f6e509b 100644 --- a/sysom_server/sysom_diagnosis/service_scripts/base.py +++ b/sysom_server/sysom_diagnosis/service_scripts/base.py @@ -7,6 +7,8 @@ Description: """ from abc import ABC, abstractmethod from typing import List, Optional, Union +from apps.task.models import JobModel +from asgiref.sync import sync_to_async class FileItem: @@ -19,7 +21,7 @@ class FileItem: return { "name": self.name, "remote_path": self.remote_path, - "local_path": self.local_path + "local_path": self.local_path, } @@ -44,7 +46,11 @@ class DiagnosisJob: return DiagnosisJob(instance=data.get("instance", ""), cmd=data.get("cmd", "")) def to_dict(self): - return {"instance": self.instance, "cmd": self.cmd, "fetch_file_list": [item.to_dict() for item in self.fetch_file_list]} + return { + "instance": self.instance, + "cmd": self.cmd, + "fetch_file_list": [item.to_dict() for item in self.fetch_file_list], + } class DiagnosisTask: @@ -191,3 +197,50 @@ class DiagnosisPostProcessor(DiagnosisProcessorBase): Args: results (List[DiagnosisResult]): Diagnosis results """ + + +class HookProcessResult: + def __init__(self, code: int, data: dict, err_msg: str = "") -> None: + self.code = code + self.data = data + self.err_msg = err_msg + + def to_dict(self): + return {"code": self.code, "data": self.data, "err_msg": self.err_msg} + + @classmethod + def from_dict(cls, data: dict) -> "HookProcessResult": + return HookProcessResult( + code=data.get("code", 1), + data=data.get("data", ""), + err_msg=data.get("err_msg", ""), + ) + + +class DiagnosisHookProcessor(DiagnosisProcessorBase): + """Hook-processor used to invoke hook scripts + + Args: + DiagnosisProcessorBase (_type_): _description_ + """ + + def __init__(self, service_name: str, **kwargs): + self.service_name = service_name + + async def save_job(self, instance: JobModel): + try: + await sync_to_async(instance.save)() + except Exception as e: + raise e + + @abstractmethod + async def invoke_hook(self, instance: JobModel, params: dict) -> HookProcessResult: + """Invoke hook scripts + + Args: + params (dict): Diagnosis parameters + + Returns: + HookProcessResult: Hook process result + """ + return HookProcessResult(code=200, data={}, err_msg="") diff --git a/sysom_server/sysom_diagnosis/service_scripts/clustermem_post.py b/sysom_server/sysom_diagnosis/service_scripts/clustermem_post.py index 4dd06d61cdd37721dfb29d54ce296763c07f3f97..4b71a0331dfb4f26960b29f4aa927f6903d0e2bc 100644 --- a/sysom_server/sysom_diagnosis/service_scripts/clustermem_post.py +++ b/sysom_server/sysom_diagnosis/service_scripts/clustermem_post.py @@ -80,5 +80,10 @@ class PostProcessor(DiagnosisPostProcessor): }) # table显示podmem结果 postprocess_result.result["podmem"] = {"data": datas} + postprocess_result.result["summary"] = {} + postprocess_result.result["summary"]['cause'] = ret["root_cause"] + postprocess_result.result["summary"]['suggestion'] = ret["suggestion"] + if ret["root_cause"] != "": + postprocess_result.result["summary"]['status'] = "warning" - return postprocess_result \ No newline at end of file + return postprocess_result diff --git a/sysom_server/sysom_diagnosis/service_scripts/colocation_cpi_post.py b/sysom_server/sysom_diagnosis/service_scripts/colocation_cpi_post.py new file mode 100644 index 0000000000000000000000000000000000000000..1526e1aea8e16132c761fa72909827d4581409e3 --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/colocation_cpi_post.py @@ -0,0 +1,17 @@ +from typing import List +from .base import DiagnosisJobResult, DiagnosisPostProcessor, PostProcessResult +import json + + +class PostProcessor(DiagnosisPostProcessor): + def parse_diagnosis_result( + self, results: List[DiagnosisJobResult] + ) -> PostProcessResult: + preprocess_result = json.loads(results[0].stdout) + postprocess_result = PostProcessResult( + code=preprocess_result["code"], + err_msg=preprocess_result["err_msg"], + result=preprocess_result["result"], + ) + + return postprocess_result diff --git a/sysom_server/sysom_diagnosis/service_scripts/colocation_cpi_pre.py b/sysom_server/sysom_diagnosis/service_scripts/colocation_cpi_pre.py new file mode 100644 index 0000000000000000000000000000000000000000..25b520d2dc7f60102234c950b8ba6f6363ef972e --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/colocation_cpi_pre.py @@ -0,0 +1,701 @@ +from abc import abstractmethod +from datetime import datetime +import json +import traceback +from typing import List +from uuid import uuid4 +from clogger import logger + +import numpy as np +from metric_reader.metric_reader import MetricReader, dispatch_metric_reader +from metric_reader.result import MetricResult +from metric_reader.task import InstantQueryTask, RangeQueryTask + +from .base import DiagnosisJob, DiagnosisPreProcessor, DiagnosisTask + + +STATUS_NORMAL = "NORMAL" +STATUS_WARNING = "WARNING" +STATUS_ERROR = "ERROR" +DEFAULT_EMPTY_STR = "N/A" + +REASON_LLC = "LLC Miss 频发" +REASON_MBW = "内存带宽延迟过高" + +SOURCE_LS = "在线业务" +SOURCE_BE = "离线业务" + + +class QueryError(Exception): + pass + + +class TimeValueError(Exception): + pass + + +class Diagnose: + def __init__(self, check_name: str, reader: MetricReader) -> None: + self._check_name = check_name + self._status = STATUS_NORMAL + self._result = "未检测到异常" + self._suggestion = DEFAULT_EMPTY_STR + self._reader = reader + + def get_overview(self) -> dict: + return { + "data": f"- **结论**: {self._result}\n- **修复建议**: {self._suggestion}\n\n" + } + + @abstractmethod + def diagnose(self) -> None: + pass + + +class ContainerCpiRecord: + def __init__(self, namespace: str, pod: str, container: str) -> None: + # analyze result + self._namespace = namespace + self._pod = pod + self._container = container + self._err_count = 0 + self._err_core_reason = DEFAULT_EMPTY_STR + self._err_core_source = DEFAULT_EMPTY_STR + self._level = DEFAULT_EMPTY_STR + # base data + self._cpi = [] + self._llc_miss_rate = [] + self._err_bitmap = [] + self._imc_latency = [] + self._llc_dist = [] + self._mbw_dist = [] + # component data + self._nr_llc = 0 + self._nr_mbw = 0 + self._nr_ls = 0 + self._nr_be = 0 + + def record( + self, + cpi: float, + llc_miss_rate: float, + err: bool, + latency: float, + llc_dist: float, + mbw_dist: float, + ) -> None: + self._cpi.append(cpi) + self._llc_miss_rate.append(llc_miss_rate) + self._err_bitmap.append(err) + self._imc_latency.append(latency) + self._llc_dist.append(llc_dist) + self._mbw_dist.append(mbw_dist) + if err: + self._err_count += 1 + + def analyze_level(self) -> None: + # 分析告警等级 + if self._err_count < 3: + self._level = "NORMAL" + elif self._err_count < 5: + self._level = "WARNING" + else: + self._level = "ERROR" + + def analyze_reason(self) -> None: + err_avg_rate = 0 + normal_avg_rate = 0 + err_avg_latency = 0 + normal_avg_latency = 0 + nr_err = self._err_count + nr_normal = len(self._err_bitmap) - self._err_count + if nr_err == 0 or nr_normal == 0: + return + + for err, miss_rate, latency in zip( + self._err_bitmap, self._llc_miss_rate, self._imc_latency + ): + if err: + err_avg_rate += miss_rate + err_avg_latency += latency + else: + normal_avg_rate += miss_rate + normal_avg_latency += latency + err_avg_rate = err_avg_rate / nr_err + err_avg_latency = err_avg_latency / nr_err + normal_avg_rate = normal_avg_rate / nr_normal + normal_avg_latency = normal_avg_latency / nr_normal + + # TODO 依赖于当前五分钟的数据 如果全是异常 则无法分析 待改成过去一天的平均延迟 + if normal_avg_rate == 0 or normal_avg_latency == 0: + return + + # 分析每一次CPI异常的具体原因 + for err, miss_rate, latency, l3_occ_rate, mbw_rate in zip( + self._err_bitmap, + self._llc_miss_rate, + self._imc_latency, + self._llc_dist, + self._mbw_dist, + ): + if not err: + continue + + # 根据missrate和内存延迟的增幅判断主要原因 + if (miss_rate - normal_avg_rate) / normal_avg_rate > ( + latency - normal_avg_latency + ) / normal_avg_latency: + + self._nr_llc += 1 + # 分析miss rate为主要原因时的干扰源 + if l3_occ_rate != float("inf") and l3_occ_rate > 0.5: + self._nr_ls += 1 + else: + self._nr_be += 1 + else: + self._nr_mbw += 1 + if mbw_rate != float("inf") and mbw_rate > 0.5: + self._nr_ls += 1 + else: + self._nr_be += 1 + + # 根据统计结果确定主要异常原因 + if self._nr_llc > self._nr_mbw: + self._err_core_reason = REASON_LLC + else: + self._err_core_reason = REASON_MBW + + # 根据统计结果确定主要干扰源 + if self._nr_ls > self._nr_be: + self._err_core_source = SOURCE_LS + else: + self._err_core_source = SOURCE_BE + + def analyze(self) -> None: + self.analyze_level() + self.analyze_reason() + + def report(self) -> dict: + return { + "key": str(uuid4()), + "容器": f"{self._namespace}-{self._pod}-{self._container}", + "干扰次数": self._err_count, + "干扰等级": self._level, + "主要干扰原因": self._err_core_reason, + "干扰源": self._err_core_source, + } + + def reason(self) -> str: + return self._err_core_reason + + def err_count(self) -> int: + return self._err_count + + def level(self) -> str: + return self._level + + def source(self) -> str: + return self._err_core_source + + +class ContainerTable: + def __init__(self) -> None: + self._container_table = {} + self._nr_err = 0 + self._nr_err_container = 0 + self._nr_llc_miss = 0 + self._nr_mbw = 0 + + self._status = DEFAULT_EMPTY_STR + self._suggestion = DEFAULT_EMPTY_STR + self._main_reason = DEFAULT_EMPTY_STR + self._main_source = DEFAULT_EMPTY_STR + + def record( + self, + namespace: str, + pod: str, + container: str, + cpi: float, + llc_miss_rate: float, + err: bool, + latency: float, + llc_dist: float, + mbw_dist: float, + ) -> None: + key = f"{namespace}-{pod}-{container}" + if key not in self._container_table.keys(): + self._container_table[key] = ContainerCpiRecord(namespace, pod, container) + self._container_table[key].record( + cpi, llc_miss_rate, err, latency, llc_dist, mbw_dist + ) + + def report(self) -> List[dict]: + ret = [] + for record in self._container_table.values(): + if record.err_count() <= 3: + continue + ret.append(record.report()) + return ret + + def analyze(self) -> None: + for record in self._container_table.values(): + if record.err_count() <= 3: + continue + self._nr_err_container += 1 + self._nr_err += record.err_count() + record.analyze() + self.analyze_main_reason() + self.analyze_status() + self.analyze_source() + + def analyze_main_reason(self) -> None: + stat = {} + for record in self._container_table.values(): + if record.err_count() <= 3: + continue + + if record.reason() not in stat.keys(): + stat[record.reason()] = 0 + stat[record.reason()] += 1 + + self._main_reason = DEFAULT_EMPTY_STR + main_count = 0 + for reason, count in stat.items(): + if count > main_count: + self._main_reason = reason + main_count = count + + def analyze_status(self) -> None: + # return the max level as the summary + stat = {} + for record in self._container_table.values(): + if record.err_count() <= 3: + continue + if record.level() not in stat.keys(): + stat[record.level()] = 0 + stat[record.level()] += 1 + + if stat.get(STATUS_ERROR, None) is not None: + self._status = STATUS_ERROR + elif stat.get(STATUS_WARNING, None) is not None: + self._status = STATUS_WARNING + elif stat.get(STATUS_NORMAL, None) is not None: + self._status = STATUS_NORMAL + else: + self._status = DEFAULT_EMPTY_STR + + def analyze_source(self) -> None: + stat = {} + for record in self._container_table.values(): + if record.err_count() <= 3: + continue + if record.source() not in stat.keys(): + stat[record.source()] = 0 + stat[record.source()] += 1 + self._main_source = DEFAULT_EMPTY_STR + main_count = 0 + for reason, count in stat.items(): + if count > main_count: + self._main_source = reason + main_count = count + + def result(self) -> str: + if self._status == STATUS_NORMAL or self._status == DEFAULT_EMPTY_STR: + return f"未检测到异常" + else: + return f"诊断时刻存在CPI干扰,主要原因为{self._main_reason},干扰的主要源头为{self._main_source}" + + def status(self) -> str: + return self._status + + def suggestion(self) -> str: + + if self._status == STATUS_NORMAL or self._status == DEFAULT_EMPTY_STR: + return DEFAULT_EMPTY_STR + else: + return ( + f"建议通过resctrl文件系统调整LLC和内存带宽分配策略或者改变Pod部署策略" + ) + + +class CpiSeries: + def __init__(self) -> None: + self._series = {} + self.llc_occ = [] + self.mbw = [] + + def record(self, timestamp: float, err_cnt: int) -> None: + if timestamp not in self._series.keys(): + self._series[timestamp] = 0 + self._series[timestamp] += err_cnt + + def attach(self, llc_occ: List[float], mbw: List[float]) -> None: + self.llc_occ = llc_occ + self.mbw = mbw + + def report(self) -> List[dict]: + ts_series = [ts for ts, _ in sorted(self._series.items())] + err_series = [err for _, err in sorted(self._series.items())] + combined = zip(ts_series, err_series, self.llc_occ, self.mbw) + return [ + { + "time": datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S"), + "CPI异常次数": err, + "在线业务-LLC占比": llc, + "在线业务-内存带宽占比": mbw, + } + for ts, err, llc, mbw in combined + ] + + +class CpiDiagnose(Diagnose): + + def __init__(self, reader: MetricReader) -> None: + super().__init__("CPI干扰", reader) + self._series = CpiSeries() + self._table = ContainerTable() + + def _calculate_rate(self, ls_data: dict, be_data: dict) -> List[float]: + ls_matrix = [arr for arr in ls_data.values()] + be_matrix = [arr for arr in be_data.values()] + + np_ls_vectors = np.array(ls_matrix) + np_be_vectors = np.array(be_matrix) + # 按列求和 + np_ls_sum_vector = np_ls_vectors.sum(axis=0) + np_be_sum_vector = np_be_vectors.sum(axis=0) + + # 计算总和 + np_sum_vector = np_ls_sum_vector + np_be_sum_vector + epsilon = 0.001 + safe_sum_vector = np.where( + np.abs(np_sum_vector) < epsilon, epsilon, np_sum_vector + ) + data = np_ls_sum_vector / safe_sum_vector + # process the divide zero + return data.tolist() + + def _query_5min_imc_data(self, instance: str, dt: datetime) -> List[float]: + ts = dt.timestamp() + task = ( + RangeQueryTask( + "sysom_imc_event_node", start_time=int(ts - 300), end_time=int(ts) + ) + .append_equal_filter("exported_instance", instance) + .append_equal_filter("value", "rlat") + ) + + result = self._reader.range_query([task]) + if result.code != 0: + raise QueryError( + f"Query sysom_imc_event_node Failed. args: instance={instance} result: code={result.code} msg=({result.err_msg})" + ) + + if len(result.data) == 0: + raise QueryError( + f"Query sysom_imc_event_node empty. args: instance={instance} start={ts-300} end={ts}" + ) + latency = [float(val[1]) for val in result.data[0].values] + return latency + + def _query_5min_rdt_data(self, instance: str, dt: datetime) -> dict: + """Query 5min rdt metric data. + + Args: + instance (str): instance ip + dt (datetime): query the 5min data before `dt` + + Raises: + QueryError: if query failed or no data. + + Returns: + dict: {'llc_occ': [float, float, ...], 'mbw': [float, float, ...]} + """ + ts = dt.timestamp() + task = ( + RangeQueryTask( + "sysom_rdt_usage", start_time=int(ts - 300), end_time=int(ts) + ) + .append_equal_filter("exported_instance", instance) + .append_wildcard_filter("value", "llc_occ|total_mem_bw") + ) + + result = self._reader.range_query([task]) + if result.code != 0: + raise QueryError( + f"Query sysom_rdt_usage Failed. args: instance={instance} result: code={result.code} msg=({result.err_msg})" + ) + data = { + "LS": {"llc_occ": {}, "total_mem_bw": {}}, + "BE": {"llc_occ": {}, "total_mem_bw": {}}, + } + """ the path follow format: sys/fs/resctrl/{tag}/mon_data/mon_L3_{socket} + tag: BE or LS + socket: 00, 01 ... + """ + + for item in result.data: + labels = item.to_dict()["labels"] + path = str(labels["path"]) + entries = path.split("/") + if len(entries) != 6 or entries[4] != "mon_data": + continue + tag = entries[3] + if tag != "LS" and tag != "BE": + continue + + socket_tag = entries[5] + data[tag][labels["value"]][socket_tag] = [ + float(val[1]) for val in item.to_dict()["values"] + ] + + llc_occ_rate = self._calculate_rate( + data["LS"]["llc_occ"], data["BE"]["llc_occ"] + ) + mbw_rate = self._calculate_rate( + data["LS"]["total_mem_bw"], data["BE"]["total_mem_bw"] + ) + data = {"llc_occ_rate": llc_occ_rate, "mbw_rate": mbw_rate} + return {"llc_occ_rate": llc_occ_rate, "mbw_rate": mbw_rate} + + def _querty_ls_miss_rate(self, instance: str, dt: datetime) -> List[float]: + # get the miss rate between ls and be + ts = dt.timestamp() + task = ( + RangeQueryTask( + "sysom_container_pmu_events", start_time=ts - 300, end_time=ts + ) + .append_equal_filter("exported_instance", instance) + .append_wildcard_filter("value", "llcStoreMis|llcLoadMis") + ) + + result = self._reader.range_query([task]) + + if result.code != 0: + raise QueryError( + f"Query sysom_container_pmu_events Failed. args: instance={instance} result: code={result.code} msg=({result.err_msg})" + ) + if len(result.data) == 0: + raise QueryError( + f"Query sysom_container_pmu_events data is empty. args: instance={instance})" + ) + ls_vectors = [] + be_vectors = [] + + for item in result.data: + labels = item.to_dict()["labels"] + tag = labels["bvt"] + nr_miss = [float(val[1]) for val in item.values] + if tag == "LS": + ls_vectors.append(nr_miss) + else: + be_vectors.append(nr_miss) + + np_ls_vectors = np.array(ls_vectors) + np_be_vectors = np.array(be_vectors) + np_ls_sum_vector = np_ls_vectors.sum(axis=0) + np_be_sum_vector = np_be_vectors.sum(axis=0) + np_sum_vector = np_ls_sum_vector + np_be_sum_vector + # process the divide zero + data = np.where( + np_sum_vector != 0, np_ls_vectors / np_sum_vector, np.nan + ).tolist() + return data + + def _query_5min_pmu_data(self, instance: str, dt: datetime) -> dict: + """_summary_ + + Args: + instance (str): _description_ + dt (datetime): _description_ + + Raises: + QueryError: _description_ + + Returns: + dict: {'ns': str, 'pod': str, 'con': str, 'cpi: [...], 'lsMisRate' : [...]} + """ + ts = dt.timestamp() + container_list = {} + + task = ( + RangeQueryTask( + "sysom_container_pmu_events", start_time=ts - 300, end_time=ts + ) + .append_equal_filter("exported_instance", instance) + .append_equal_filter("bvt", "LS") + .append_wildcard_filter("value", "CPI|l3MisRate") + ) + + result = self._reader.range_query([task]) + + if result.code != 0: + raise QueryError( + f"Query sysom_container_pmu_events Failed. args: instance={instance} result: code={result.code} msg=({result.err_msg})" + ) + for item in result.data: + labels = item.to_dict()["labels"] + key = f"{labels['namespace']}:{labels['pod']}:{labels['container']}" + + if labels["container"] != "None" and key not in container_list.keys(): + container_list[key] = { + "ns": labels["namespace"], + "pod": labels["pod"], + "con": labels["container"], + } + container_list[key][labels["value"]] = item.values + return container_list + + def _parse_aggregate_result(self, result: MetricResult) -> float: + if result.code != 0: + raise QueryError( + f"Query stddev or mean failed. err_code={result.code} msg={result.err_msg}" + ) + if len(result.data) == 0: + raise QueryError( + f"Query stddev or mean no data. Please check metric table." + ) + + return float(result.data[0].to_dict()["value"][1]) + + def _query_yesterday_mean( + self, instance: str, dt: datetime, namespace: str, pod: str, container: str + ) -> float: + return self._query_yesterday_agg( + instance, dt, namespace, pod, container, "avg_over_time" + ) + + def _query_yesterday_stddev( + self, instance: str, dt: datetime, namespace: str, pod: str, container: str + ) -> float: + return self._query_yesterday_agg( + instance, dt, namespace, pod, container, "stddev_over_time" + ) + + def _query_yesterday_agg( + self, + instance: str, + dt: datetime, + namespace: str, + pod: str, + container: str, + agg: str, + ) -> float: + zero_ts = dt.replace(hour=0, minute=0, second=0, microsecond=0).timestamp() + task = ( + InstantQueryTask("sysom_container_pmu_events", zero_ts, agg, "1d") + .append_equal_filter("exported_instance", instance) + .append_equal_filter("namespace", namespace) + .append_equal_filter("pod", pod) + .append_equal_filter("container", container) + .append_equal_filter("value", "CPI") + ) + result = self._reader.instant_query([task]) + return self._parse_aggregate_result(result) + + def diagnose(self, instance: str, dt: datetime) -> None: + container_list = self._query_5min_pmu_data(instance, dt) + imc_lat_list = self._query_5min_imc_data(instance, dt) + rdt_data = self._query_5min_rdt_data(instance, dt) + for item in container_list.values(): + cpi_avg = self._query_yesterday_mean( + instance, dt, item["ns"], item["pod"], item["con"] + ) + cpi_stddev = self._query_yesterday_stddev( + instance, dt, item["ns"], item["pod"], item["con"] + ) + + for cpi, l3_miss_rate, imc_lat, occ_rate, mbw_rate in zip( + item["CPI"], + item["l3MisRate"], + imc_lat_list, + rdt_data["llc_occ_rate"], + rdt_data["mbw_rate"], + ): + ts = float(cpi[0]) + cpi_val = float(cpi[1]) + miss_val = float(l3_miss_rate[1]) + err = cpi_val != 0 and cpi_val >= cpi_avg + 2 * cpi_stddev + if err: + # 2. 时间序列累计每个时刻的异常次数 + self._series.record(timestamp=ts, err_cnt=1) + else: + self._series.record(timestamp=ts, err_cnt=0) + + # 3. 容器维度累计容器的异常次数 同时分析异常时刻的异常原因是llc miss rate还是内存延迟 + self._table.record( + namespace=item["ns"], + pod=item["pod"], + container=item["con"], + cpi=cpi_val, + llc_miss_rate=miss_val, + err=err, + latency=imc_lat, + llc_dist=occ_rate, + mbw_dist=mbw_rate, + ) + self._series.attach(rdt_data["llc_occ_rate"], rdt_data["mbw_rate"]) + self._table.analyze() + self._status = self._table.status() + self._suggestion = self._table.suggestion() + self._result = self._table.result() + + def get_timeseries(self) -> dict: + return {"data": self._series.report()} + + def get_container_table(self) -> dict: + return {"data": self._table.report()} + + +def diagnose(instance: str, dt: datetime) -> dict: + result = {} + metric_reader = dispatch_metric_reader("prometheus://localhost:9090") + cpi_diag = CpiDiagnose(metric_reader) + cpi_diag.diagnose(instance, dt) + + result["overview"] = cpi_diag.get_overview() + result["container-table"] = cpi_diag.get_container_table() + result["disturb-timeseries"] = cpi_diag.get_timeseries() + return result + + +def validate_time(time_str: str) -> datetime: + try: + return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") + except ValueError: + raise TimeValueError(f"Time format error. time_str={time_str}") + + +class PreProcessor(DiagnosisPreProcessor): + """Command diagnosis + + Just invoke command in target instance and get stdout result + + Args: + DiagnosisPreProcessor (_type_): _description_ + """ + + def get_diagnosis_cmds(self, params: dict) -> DiagnosisTask: + result = {"code": 0, "err_msg": "", "result": {}} + try: + instance = params.get("instance", "") + # process host:port, we only use host + instance = str(instance).split(":")[0] + time_str = params.get("moment", "") + dt = datetime.now() if time_str == "" else validate_time(time_str) + result["result"] = diagnose(instance, dt) + except Exception as e: + logger.error(f"Diagnose error. err={e}") + traceback.print_exc() + result = { + "code": 1, + "err_msg": f"{str(e)}\n解决方法:请检查是否是混部场景、是否支持硬件指标采集、是否开启SysAK对应插件", + "result": {}, + } + finally: + return DiagnosisTask( + jobs=[DiagnosisJob(instance="", cmd="")], + offline_mode=True, + offline_results=[json.dumps(result)], + ) diff --git a/sysom_server/sysom_diagnosis/service_scripts/colocation_serveutil_post.py b/sysom_server/sysom_diagnosis/service_scripts/colocation_serveutil_post.py new file mode 100644 index 0000000000000000000000000000000000000000..1526e1aea8e16132c761fa72909827d4581409e3 --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/colocation_serveutil_post.py @@ -0,0 +1,17 @@ +from typing import List +from .base import DiagnosisJobResult, DiagnosisPostProcessor, PostProcessResult +import json + + +class PostProcessor(DiagnosisPostProcessor): + def parse_diagnosis_result( + self, results: List[DiagnosisJobResult] + ) -> PostProcessResult: + preprocess_result = json.loads(results[0].stdout) + postprocess_result = PostProcessResult( + code=preprocess_result["code"], + err_msg=preprocess_result["err_msg"], + result=preprocess_result["result"], + ) + + return postprocess_result diff --git a/sysom_server/sysom_diagnosis/service_scripts/colocation_serveutil_pre.py b/sysom_server/sysom_diagnosis/service_scripts/colocation_serveutil_pre.py new file mode 100644 index 0000000000000000000000000000000000000000..f9ebafd09fa8c31ed846bf9211f901411ac9630a --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/colocation_serveutil_pre.py @@ -0,0 +1,196 @@ +from datetime import datetime +import json +import traceback +from typing import List +from uuid import uuid4 +from clogger import logger +from metric_reader.metric_reader import dispatch_metric_reader +from metric_reader.task import InstantQueryTask +from .base import DiagnosisJob, DiagnosisPreProcessor, DiagnosisTask + +# TODO data query use exported instance, only contain host, so if occur one host multi prometheus client will be wrong + + +def validate_time(time_str: str) -> datetime: + try: + return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") + except ValueError: + raise Exception(f"Time format error. time_str={time_str}") + + +class ServeUtilDiagnose: + def __init__(self, instance: str, dt: datetime): + self._instance = instance + self._dt = dt + self._ts = dt.timestamp() + self._metric_reader = dispatch_metric_reader("prometheus://localhost:9090") + self._container_table = [] + self._internal_err = 0 + + self._status = "N/A" + self._cause = "N/A" + self._suggestion = "N/A" + + def _query_all_container(self) -> List[dict]: + task = ( + InstantQueryTask("sysom_container_cfs_statis", self._dt.timestamp()) + .append_equal_filter("exported_instance", self._instance) + .append_equal_filter("bvt", "LS") + .append_wildcard_filter("value", "qother|qslibling|serveutil") + ) + result = self._metric_reader.instant_query([task]) + if result.code != 0: + raise Exception( + f"Query sysom_container_cfs_statis all container data filed. code={result.code} err_msg={result.err_msg}" + ) + container_list = {} + for item in result.data: + lables = item.labels + key = f"{lables['namespace']}-{lables['pod']}-{lables['container']}" + data = item.value + + if lables["container"] != "None" and key not in container_list: + container_list[key] = { + "ns": lables["namespace"], + "pod": lables["pod"], + "con": lables["container"], + "serveutil": 0, + } + container_list[key][lables["value"]] = float(data[1]) + + return [val for val in container_list.values()] + + def _query_serve_util_rate(self, ns: str, pod: str, con: str, end: float) -> float: + task = ( + InstantQueryTask( + "sysom_container_cfs_statis", + end, + aggregation="quantile_over_time", + interval="7d", + aggregation_val="0.01", + ) + .append_equal_filter("exported_instance", self._instance) + .append_equal_filter("namespace", ns) + .append_equal_filter("pod", pod) + .append_equal_filter("container", con) + .append_equal_filter("bvt", "LS") + .append_equal_filter("value", "serveutil") + ) + result = self._metric_reader.instant_query([task]) + if result.code != 0: + raise Exception( + f"Query sysom_container_cfs_statis p99 serveutil data failed. code={result.code} err_msg={result.err_msg}" + ) + if len(result.data) == 0: + raise Exception(f"Query sysom_container_cfs_statis p99 serveutil empty.") + return float(result.data[0].value[1]) + + def diagnose(self) -> None: + container_list = self._query_all_container() + for container in container_list: + cur_serveutil = container["serveutil"] + # if no exec time skip it + if cur_serveutil == 0: + continue + p99_serveutil = self._query_serve_util_rate( + container["ns"], container["pod"], container["con"], self._ts + ) + if cur_serveutil < p99_serveutil: + qother = container["qother"] + qslibing = container["qslibling"] + self._container_table.append( + { + "key": str(uuid4()), + "容器": f"{container['ns']}-{container['pod']}-{container['con']}", + "CFS满足率(current/p99)": f"{cur_serveutil}/{p99_serveutil}", + "等待时间分布(other/slibing)": f"{qother}/{qslibing}", + "主要干扰原因": ( + "Pod外部容器" if qother >= qslibing else "Pod内部容器" + ), + } + ) + if qother < qslibing: + self._internal_err += 1 + if len(self._container_table) != 0: + self._status = "ERROR" + self._cause = ( + "Pod内部" + if self._internal_err > len(self._container_table) - self._internal_err + else "Pod外部" + ) + self._suggestion = "对存在外部容器干扰的Pod调大CPU资源配额,对存在内部容器干扰的Pod调整容器数量" + else: + self._status = "NORMAL" + + def container_table(self) -> List[dict]: + return self._container_table + + def overview(self) -> str: + result = "系统无异常状态" + + if len(self._container_table) != 0: + result = f"当前系统状态异常,有{len(self._container_table)}个受干扰容器,主要干扰来源于{self._cause}" + return f"- **结论**:{result}。\n- **修复建议**:{self._suggestion}\n" + + def status(self) -> str: + return self._status + + def suggestion(self) -> str: + return self._suggestion + + def cause(self) -> str: + return self._cause + + +class PreProcessor(DiagnosisPreProcessor): + """Command diagnosis + + Just invoke command in target instance and get stdout result + + Args: + DiagnosisPreProcessor (_type_): _description_ + """ + + def get_diagnosis_cmds(self, params: dict) -> DiagnosisTask: + result = {"code": 0, "err_msg": "", "result": {}} + try: + instance = params.get("instance", "") + # process host:port, we only use host + instance = str(instance).split(":")[0] + + time_str = params.get("moment", "") + dt = datetime.now() if time_str == "" else validate_time(time_str) + diagnsoe = ServeUtilDiagnose(instance, dt) + diagnsoe.diagnose() + result["result"] = { + "summary": { + "status": diagnsoe.status(), + "cause": diagnsoe.cause(), + "suggestion": diagnsoe.suggestion(), + }, + "overview": {"data": diagnsoe.overview()}, + "container-table": {"data": diagnsoe.container_table()}, + } + + except Exception as e: + logger.error(f"ServeUtil Diagnose failed. err={e}") + traceback.print_exc() + result = { + "code": 1, + "err_msg": f"{str(e)}\n 解决方法:请检查是否是混部场景、是否支持硬件指标采集、是否开启SysAK对应插件", + "result": { + "summary": { + "status": "normal", + "cause": "N/A", + "suggestion": "N/A", + }, + "overview": {"data": ""}, + "container-table": {"data": []}, + }, + } + finally: + return DiagnosisTask( + jobs=[DiagnosisJob(instance="", cmd="")], + offline_mode=True, + offline_results=[json.dumps(result)], + ) diff --git a/sysom_server/sysom_diagnosis/service_scripts/command_hook.py b/sysom_server/sysom_diagnosis/service_scripts/command_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..f43033ca67ba9be9eccfbe76c1a637cb44b1f89d --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/command_hook.py @@ -0,0 +1,36 @@ +""" +Time 2023/06/19 17:32 +Author: mingfeng (SunnyQjm) +Email mfeng@linux.alibaba.com +File command.py +Description: +""" +import json +from apps.task.models import JobModel +from clogger import logger +from .base import HookProcessResult, DiagnosisHookProcessor + +class HookProcessor(DiagnosisHookProcessor): + """Hook-processor used to invoke hook scripts + + Args: + DiagnosisProcessorBase (_type_): _description_ + """ + + async def invoke_hook(self, instance: JobModel, params: dict) -> HookProcessResult: + """Invoke hook scripts + + Args: + params (dict): Diagnosis parameters + + Returns: + HookProcessResult: Hook process result + """ + replace_text = params.get("replace_text", "") + + origin_result = json.loads(instance.result) + origin_result["CommandResult"]["data"][0] = {"key": "", "value": replace_text} + + instance.result = json.dumps(origin_result) + await self.save_job(instance) + return HookProcessResult(code=200, data={}, err_msg="") \ No newline at end of file diff --git a/sysom_server/sysom_diagnosis/service_scripts/filecache_post b/sysom_server/sysom_diagnosis/service_scripts/filecache_post index 184d4163256a26b7696fecef81386270c016d031..da65e1e7e6d68e647ecf1bb1854b86397f8bb348 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/filecache_post +++ b/sysom_server/sysom_diagnosis/service_scripts/filecache_post @@ -22,7 +22,10 @@ def filecache_result(raw): cache_type = rawdata['type'] podid = cache_type data = rawdata['data'] - cache['summary'] = "success" + cache['summary'] = {} + cache['summary']['status'] = "normal" + cache['summary']['cause'] = "Just show filecache" + cache['summary']['suggestion'] = "Show top 10 cached of files" for key, _value in data.items(): for value in _value: if len(value['sort_file']) == 0: diff --git a/sysom_server/sysom_diagnosis/service_scripts/iodiagnose b/sysom_server/sysom_diagnosis/service_scripts/iodiagnose new file mode 100755 index 0000000000000000000000000000000000000000..af36b61e9ab8b95aa18d16645b79fdc9325daa0c --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/iodiagnose @@ -0,0 +1,45 @@ +#!/usr/bin/python3 +# coding=utf-8 +import json +import sys + +############################################################################### +## 如前端入参是这个: +## { +## "instance":"192.168.1.101", +## "timeout":"30" +## } +############################################################################### +class Param(dict): + def __missing__(self,key): + sys.stderr.write("入参检查失败,没有检查到入参'%s'"%key) + exit(1) + +args = Param(json.loads(sys.argv[1])) + +result = {} +result['commands'] = [] + +cmd0 = {} +cmd0['instance'] = args["instance"] +cmd0_arg_T = args.get("timeout","30") +cmd0_arg_yaml = '/etc/sysak/base.yaml' +cmd0_arg_log = '/var/log/sysak/iosdiag/iodiagnose/iodiagnose.log' +if int(cmd0_arg_T) <= 0: + cmd0_arg_T = 30 + +dump_log_cmd = "cat /var/log/sysak/iosdiag/iodiagnose/iodiagnose.log 2>/dev/null;" +iosdiag_cmd = "sysak ioMonitor -y "+cmd0_arg_yaml+" -t "+str(cmd0_arg_T)+" -a "+cmd0_arg_log+" -d > /dev/null" +print_result_cmd = ( + "if [ ! -e {logfile} ]; then " + "echo \"normal\"; " + "else " + "{dumpcmd} " + "fi" +).format(logfile=cmd0_arg_log, dumpcmd=dump_log_cmd) +cmd0['cmd'] = iosdiag_cmd+" && "+print_result_cmd + +result['commands'].append(cmd0) + +data = json.dumps(result) +print(data) diff --git a/sysom_server/sysom_diagnosis/service_scripts/iodiagnose_post b/sysom_server/sysom_diagnosis/service_scripts/iodiagnose_post new file mode 100755 index 0000000000000000000000000000000000000000..219826eefc30e2b3ec629bff51c7e20ab69f0d6c --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/iodiagnose_post @@ -0,0 +1,68 @@ +#!/usr/bin/python3 +# coding=utf-8 +import sys +import json +import re + +def number_to_letter(match): + return '\n{}. '.format(chr(int(match.group(1)) + 96)) +def iosdiagJoinData(raw): + postprocess_result = { + "code": 0, + "err_msg": "", + "result": {} + } + reasonList, suggestList = [], [] + + stat = {} + data = [] + stat["iodiagnoseOverview"] = { + "data": [{'Check Result': "normal", "IOs problem detected": 0}]} + stat["summary"] = {"status": "normal", "cause": "", + "suggestion": ""} + if raw.startswith("normal"): + postprocess_result['result'] = stat + s = json.dumps(postprocess_result, indent=4) + print(s) + return + diagnose_list = raw.split('\n') + num = 0 + for i, s in enumerate(diagnose_list): + try: + obj = json.loads(s) + data.append({ + "diag_type": obj['diag_type'], + 'devname': obj['devname'], + 'diagret': obj['diagret'], + 'reason': obj['reason'], + 'solution': obj['solution'] + }) + num += 1 + reasonList.append(" ".join([obj['diag_type'], re.sub(r"(\d+)\. ", + number_to_letter, obj['reason'])])) + suggestList.append(obj['diag_type'] + ' \n' +obj['solution'] + ';') + except Exception: + continue + stat["iodiagnoseOverview"] = { + "data": [{'Check Result': "abnormal", "IOs problem detected": num}]} + stat["iodiagnoseDetail"] = {"data": data} + stat["summary"]["status"] = "warning" + stat["summary"]["cause"] = "\n".join(["{}. {}".format(i+1, reason) \ + for i, reason in enumerate(reasonList)]) + stat["summary"]["suggestion"] = " \n".join(["{}. {}".format(i+1, suggest) \ + for i, suggest in enumerate(suggestList)]) + postprocess_result['result'] = stat + s = json.dumps(postprocess_result, indent=4) + print(s) + + +def extract_params(): + path, res, task_id = sys.argv[1], "", sys.argv[2] + with open(path, 'r') as tmp: + res = tmp.read() + return res, task_id + + +if __name__ == "__main__": + res, _ = extract_params() + iosdiagJoinData(res) diff --git a/sysom_server/sysom_diagnosis/service_scripts/iofsstat_post b/sysom_server/sysom_diagnosis/service_scripts/iofsstat_post index a7ca294f6b883cc830d20dc33376c923a7a0c208..cbfe6a3325bb2dc5c5d7ac7a2c840d211825cd08 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/iofsstat_post +++ b/sysom_server/sysom_diagnosis/service_scripts/iofsstat_post @@ -158,14 +158,17 @@ def iofsstatJoinData(raw): stat["disks"] = {"data": []} stat["overview"] = {"data": []} stat["overview"]["data"] = dataToSummary(raw) - stat["summary"] = "diagnose results: No IO traffic detected" + stat["summary"] = {"status": "normal", "cause": "", "suggestion": ""} if stat["overview"]["data"]: - stat["summary"] = \ - "diagnose results: %s, caused by \n%ssolution:%s" % ( - stat["overview"]["data"][0]['result'], - stat["overview"]["data"][0]['reason'], - stat["overview"]["data"][0]['solution'], - ) + stat["summary"]["status"] = "warning" + stat["summary"]["cause"] = stat["overview"]["data"][0]['reason'] + stat["summary"]["suggestion"] = stat["overview"]["data"][0]['solution'] + # stat["summary"] = \ + # "diagnose results: %s, caused by \n%ssolution:%s" % ( + # stat["overview"]["data"][0]['result'], + # stat["overview"]["data"][0]['reason'], + # stat["overview"]["data"][0]['solution'], + # ) for s in raw.split('\n'): try: obj = json.loads(s) diff --git a/sysom_server/sysom_diagnosis/service_scripts/iohang_post b/sysom_server/sysom_diagnosis/service_scripts/iohang_post index 7617dba6b1e23c123bb23966f0b0861f6d17742d..11d8e63a64348bc26a90760307831f7255640c0b 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/iohang_post +++ b/sysom_server/sysom_diagnosis/service_scripts/iohang_post @@ -2,13 +2,15 @@ # coding=utf-8 import sys import json - +import time +import re +from collections import OrderedDict def iosdiagJoinData(raw): postprocess_result = { "code": 0, "err_msg": "", - "result": {} + "result": {}, } if raw.startswith('fail'): postprocess_result["code"] = 1 @@ -24,13 +26,16 @@ def iosdiagJoinData(raw): "data": [{'key': 'Check Result', "value": "normal"}, {'key': "Number of OS HANG", "value": 0}, {'key': "Number of Disk HANG", "value": 0}]} - stat["summary"] = "diagnose results: Normal, No IO Hang" + stat["summary"] = {"status": "normal", "cause": "", + "suggestion": ""} + reasonList, suggestList = [], [] for s in raw.split('\n'): try: obj = json.loads(s) except Exception: continue + # stat["summary"]["cause"], stat["summary"]["suggestion"] = iohangResultReport(obj) dataSource = "singleIO_" if "percent" in str(obj): disks = [s['diskname'] for s in obj['summary'] @@ -52,7 +57,7 @@ def iosdiagJoinData(raw): stat[dataSource+d]["data"] = s["hung ios"] else: count = sum([io["count"] for io in s["hung ios"]]) - status = "abnormal" if count else "normal" + status = "warning" if count else "normal" dataOverview = [ {'key': 'Check for '+d, "value": status}] maxCount = 0 @@ -66,11 +71,20 @@ def iosdiagJoinData(raw): maxDelayComp = io["component"] stat[dataSource+d]["data"] = dataOverview - if 'Abnormal' not in stat["summary"]: - stat["summary"] = "diagnose results: Abnormal, " - stat["summary"] += \ - ("The IO of disk %s is hang, caused by %s hang;" % ( - d, maxDelayComp)) + if 'Abnormal' not in stat["summary"]["status"]: + stat["summary"]["status"] = "warning" + reasonList.append(("The IO of disk %s is hang, caused by %s hang" % + (d, maxDelayComp))) + if maxDelayComp == 'Disk' or maxDelayComp == 'OS': + suggestList.append( + ("Please confirm whether the disk %s is normal" % (d))) + if maxDelayComp == 'OS': + suggestList.append(("Please ask the OS kernel expert" )) + stat["summary"]["cause"] = "; \n".join(["{}. {}".format(i+1, reason) \ + for i, reason in enumerate(reasonList)]) + stat["summary"]["suggestion"] = "; \n".join(["{}. {}".format(i+1, suggest) \ + for i, suggest in enumerate(suggestList)]) + postprocess_result["result"] = stat s = json.dumps(postprocess_result, indent=4) print(s) diff --git a/sysom_server/sysom_diagnosis/service_scripts/iolatency b/sysom_server/sysom_diagnosis/service_scripts/iolatency index 4ca9dbe61106679d3cd1615d8ea6da2d6f72b13d..29550374f2649b7fbe03c8e752b5f4ee30e44065 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/iolatency +++ b/sysom_server/sysom_diagnosis/service_scripts/iolatency @@ -26,21 +26,22 @@ cmd0 = {} cmd0['instance'] = args["instance"] cmd0_arg_t = args.get("threshold","1000") if int(cmd0_arg_t) < 0: - cmd0_arg_t = 0 + cmd0_arg_t = 10 cmd0_arg_T = args.get("timeout","10") if int(cmd0_arg_T) <= 0: cmd0_arg_T = 10 cmd0_arg_device = args.get("disk","") -dump_log_cmd = "cat /var/log/sysak/iosdiag/latency/result.log.stat 2>/dev/null;\ - echo \"\";cat /var/log/sysak/iosdiag/latency/result.log.seq 2>/dev/null;\ - echo \"\";cat /var/log/sysak/iosdiag/latency/result.log 2>/dev/null;" +tools_path = "/usr/local/sysak/.sysak_components/tools/" +add_threshold_cmd = "echo "+str(cmd0_arg_t)+" >> /var/log/sysak/iosdiag/latency/result.log.seq;" +dump_log_cmd ="cat /var/log/sysak/iosdiag/latency/result.log.seq 2>/dev/null;" iosdiag_cmd = "sysak -g iosdiag latency -t "+str(cmd0_arg_t)+" -T "+str(cmd0_arg_T)+" "+cmd0_arg_device+" > /dev/null" print_result_cmd = "if [ ! -e /var/log/sysak/iosdiag/latency/result.log.seq ]; then "+\ - "echo \"fail\"; elif [ -e /var/log/sysak/iosdiag/latency/result.log.stat ]; then "+dump_log_cmd+\ - "else echo \"\"; fi" -cmd0['cmd'] = "rm /var/log/sysak/iosdiag/latency/* -f && "+iosdiag_cmd+" && "+print_result_cmd + "echo \"fail\"; else "+add_threshold_cmd+dump_log_cmd+" fi" +cmd0['cmd'] = "rm /var/log/sysak/iosdiag/latency/* -f && mv "+tools_path+"iosdiag_data_analysis "+tools_path+\ + "iosdiag_data_analysis.bak && "+iosdiag_cmd+" && "+print_result_cmd+" && mv "+tools_path+"iosdiag_data_analysis.bak "+\ + tools_path+"iosdiag_data_analysis" result['commands'].append(cmd0) data = json.dumps(result) -print(data) +print(data) \ No newline at end of file diff --git a/sysom_server/sysom_diagnosis/service_scripts/iolatency_post b/sysom_server/sysom_diagnosis/service_scripts/iolatency_post index cc1ed7690f2f671991f02b6da2ca2ee998e32f45..32fb0ba2c3ebbafd61abfa8bcef214019cb7dbe4 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/iolatency_post +++ b/sysom_server/sysom_diagnosis/service_scripts/iolatency_post @@ -3,7 +3,172 @@ import sys import json import re +import os +from collections import OrderedDict +if os.geteuid() != 0: + print("This program must be run as root. Aborting.") + sys.exit(0) + +def execCmd(cmd): + r = os.popen(cmd) + text = r.read() + r.close() + return text + +def humConvert(value): + units = ["B", "KB", "MB", "GB", "TB", "PB"] + size = 1024.0 + for i in range(len(units)): + if (value / size) < 1: + return "%.2f%s/s" % (value, units[i]) + value = value / size + +class latencyAnalysis: + def __init__(self): + self.delayStatDicts = {} + self.delayDicts = {} + self.summaryDicts = {} + self.totalIosDicts = {} + self.totalDelayDicts = {} + self.diskIdxDicts = {} + self.totalDiskCnt = 0 + self.threshold = 0 + self.componentDicts = OrderedDict([('os(block)',0),('os(driver)',1),\ + ('disk',2),('os(complete)',3),('os(done)',4)]) + self.delayStatJsonStr = \ + '{ \ + "diskname":"","delays":[ \ + {"component":"os(block)","percent":"","max":0,"min":1000000000,"avg":0},\ + {"component":"os(driver)","percent":"","max":0,"min":1000000000,"avg":0},\ + {"component":"disk","percent":"","max":0,"min":1000000000,"avg":0}, \ + {"component":"os(complete)","percent":"","max":0,"min":1000000000,"avg":0},\ + {"component":"os(done)","percent":"","max":0,"min":1000000000,"avg":0}]\ + }' + newDelayStatDict = json.loads("["+self.delayStatJsonStr + "]", object_pairs_hook=OrderedDict) + self.delayStatDicts.setdefault('summary', newDelayStatDict) + self.entryDictJsonStr = \ + '{ \ + "diskname":"",\ + "slow ios":[] \ + }' + newSummaryDict = json.loads("["+self.entryDictJsonStr + "]", object_pairs_hook=OrderedDict) + self.summaryDicts.setdefault('summary', newSummaryDict) + newDelayDict = json.loads("["+self.entryDictJsonStr + "]", object_pairs_hook=OrderedDict) + self.delayDicts.setdefault('summary', newDelayDict) + + def __newDiskDict(self, disk): + if self.totalDiskCnt != 0: + newDelayStatDict = json.loads(self.delayStatJsonStr, object_pairs_hook=OrderedDict) + self.delayStatDicts['summary'].append(newDelayStatDict) + newSummaryDict = json.loads(self.entryDictJsonStr, object_pairs_hook=OrderedDict) + self.summaryDicts['summary'].append(newSummaryDict) + newDelayDict = json.loads(self.entryDictJsonStr, object_pairs_hook=OrderedDict) + self.delayDicts['summary'].append(newDelayDict) + self.delayStatDicts['summary'][self.totalDiskCnt]['diskname'] = disk + self.summaryDicts['summary'][self.totalDiskCnt]['diskname'] = disk + self.delayDicts['summary'][self.totalDiskCnt]['diskname'] = disk + self.totalDelayDicts.setdefault(disk, 0) + self.totalIosDicts.setdefault(disk, 0) + self.diskIdxDicts.setdefault(disk, self.totalDiskCnt) + self.totalDiskCnt += 1 + + def processLatencyDelays(self, sDict): + diskIdxDicts = self.diskIdxDicts + totalDelayDicts = self.totalDelayDicts + componentDicts = self.componentDicts + delayStatDicts = self.delayStatDicts + delayDicts = self.delayDicts + + disk = sDict['diskname'] + del sDict['diskname'] + totalDelayDicts[disk] += sDict['totaldelay'] + diskIdx = diskIdxDicts[disk] + delayDicts['summary'][diskIdx]['slow ios'].append(sDict) + for component,idx in componentDicts.items(): + try: + delay = sDict['delays'][idx]['delay'] + except Exception: + return + if delay > delayStatDicts['summary'][diskIdx]['delays'][idx]['max']: + delayStatDicts['summary'][diskIdx]['delays'][idx]['max'] = delay + if delay < delayStatDicts['summary'][diskIdx]['delays'][idx]['min']: + delayStatDicts['summary'][diskIdx]['delays'][idx]['min'] = delay + delayStatDicts['summary'][diskIdx]['delays'][idx]['avg'] += delay + + def processLatencySummary(self, sDict): + diskIdxDicts = self.diskIdxDicts + summaryDicts = self.summaryDicts + + disk = sDict['diskname'] + diskIdx = diskIdxDicts[disk] + del sDict['diskname'] + listAbnormal=[i for i in sDict['abnormal'].split(' ') if i != ''] + msDelay=int(listAbnormal[-2].strip('(').split(':')[0]) / 1000.000 + msTotalDelay=int(listAbnormal[-2].strip('(').split(':')[1]) / 1000.000 + sDict['abnormal']=listAbnormal[0]+' '+listAbnormal[1]+" ("+str(msDelay)+":"+str(msTotalDelay)+" ms)" + summaryDicts['summary'][diskIdx]['slow ios'].append(sDict) + + def processOneLatencySeq(self, sDict): + totalIosDicts = self.totalIosDicts + disk = sDict['diskname'] + + if disk not in totalIosDicts.keys(): + self.__newDiskDict(disk) + + totalIosDicts[disk] += 1 + if "abnormal" in sDict: + self.processLatencySummary(sDict) + else: + self.processLatencyDelays(sDict) + + def latencyCalculate(self): + diskIdxDicts = self.diskIdxDicts + totalIosDicts = self.totalIosDicts + totalDelayDicts = self.totalDelayDicts + componentDicts = self.componentDicts + delayStatDicts = self.delayStatDicts + summaryDicts = self.summaryDicts + delayDicts = self.delayDicts + + for disk, diskIdx in diskIdxDicts.items(): + totalIosDicts[disk] = int(totalIosDicts[disk] / 2) + totalIos = totalIosDicts[disk] + maxPercent = 0 + avgTotalDelay = totalDelayDicts[disk] / totalIos + for component,idx in componentDicts.items(): + delayStatDicts['summary'][diskIdx]['delays'][idx]['avg'] /= totalIos + avgDelay = delayStatDicts['summary'][diskIdx]['delays'][idx]['avg'] + #percent = avgDelay * 100.0 / avgTotalDelay + percent = round((avgDelay * 100.0 / avgTotalDelay), 3) + if percent > maxPercent: + maxPercent = percent + delayStatDicts['summary'][diskIdx]['delays'][idx]['percent'] = str(percent)+"%" + +def latencyDataAnalysis(resultSeq, threshold): + analysis = latencyAnalysis() + resultSeqList = resultSeq.split('\n') + for s in resultSeqList[:-2]: + try: + sDict = json.loads(s, object_pairs_hook=OrderedDict) + except ValueError: + continue + analysis.processOneLatencySeq(sDict) + if analysis.totalDiskCnt == 0: + print("\n0 IOs over %d ms, everything is ok !^o^ ~" % int(threshold)) + return + analysis.latencyCalculate() + summary = json.dumps(analysis.delayStatDicts) + "\n" + json.dumps(analysis.delayDicts) +\ + "\n" + json.dumps(analysis.summaryDicts) + return summary + +def get_threshold(log): + length = len(log) + for i in range(1, length-1): + if log[length-i-1].isdigit() != True: + if log[length-i+1:length-1].isdigit(): + return int(log[length-i+1:length-1]) + return -1 def iosdiagJoinData(raw): postprocess_result = { @@ -16,15 +181,24 @@ def iosdiagJoinData(raw): postprocess_result["err_msg"] = f"Diagnosis failed:\n{raw}" print(json.dumps(postprocess_result, indent=4)) return - - raw = raw.strip() + + raw.strip() disks = [] stat = {} stat["disks"] = {"data": [{'key': 0, 'value': 'overview'}]} stat["iolatencyOverview_overview"] = { "data": [{'key': 'Check Result', "value": "normal"}, {'key': "IOs of over threshold", "value": 0}]} - stat["summary"] = "diagnose results: Normal, No slow IO over threshold" + stat["summary"] = {"status": "normal", "cause": "", "suggestion": ""} + + threshold = get_threshold(raw) + if threshold == -1: + postprocess_result['result'] = stat + s = json.dumps(postprocess_result, indent=4) + print(s) + return + + raw = latencyDataAnalysis(raw, 1000 if threshold == None else threshold) for s in raw.split('\n'): try: obj = json.loads(s) @@ -99,22 +273,43 @@ def iosdiagJoinData(raw): for d in delay['delays']: entry[d['component']] = d['delay'] stat[diskIdx]["data"].append(entry) + + reasonList, suggestList = [], [] for d in disks: + suggest = 'solution=\"Please ask the OS kernel expert\"' + reason = 'reason=\"Unknown\"' if 'singleIOMetrics_'+d in stat.keys(): count = len(stat['singleIOMetrics_'+d]["data"]) stat["iolatencyOverview_"+d] = { "data": [{'key': 'Check Result', "value": "abnormal"}, {'key': "IOs of over threshold", "value": count}]} if 'iolatencyDistribution_'+d in stat.keys(): - if 'Abnormal' not in stat["summary"]: - stat["summary"] = "diagnose results: Abnormal, " + if 'normal' == stat["summary"]["status"]: + stat["summary"]["status"] = "warning" delays = sorted(stat['iolatencyDistribution_'+d]["data"], key=lambda e: (float(e['value'].strip('%'))), reverse=True) - maxDelayComp = delays[0]['key'] - stat["summary"] += \ - ("The IO of disk %s is slow, caused by high %s latency;" % ( - d, maxDelayComp)) + maxDelayComp = delays[0]['key'] + if maxDelayComp == 'disk': + reason = ('Disk delay(disk %s processing IO slowly)' % (d)) + suggest= ('Please confirm whether the disk %s is normal' % (d)) + elif maxDelayComp == 'os(block)': + if delays[1]['component'] == 'disk' and \ + float(delays[1]['percent'].strip('%')) > 20: + reason = ('Disk delay(disk %s processing IO slowly)' % (d)) + suggest = ('Please confirm whether the disk %s is normal' % (d)) + continue + reason = 'OS delay(Issuing IO slowly at os(block))' + else: + reason = ('OS delay(processing IO slowly at %s)' %( + str(maxDelayComp))) + reasonList.append(reason) + suggestList.append(suggest) + stat["summary"]["cause"] = "; \n".join(["{}. {}".format(i+1, reason) \ + for i, reason in enumerate(reasonList)]) + stat["summary"]["suggestion"] = "; \n".join(["{}. {}".format(i+1, suggest) \ + for i, suggest in enumerate(suggestList)]) + postprocess_result['result'] = stat s = json.dumps(postprocess_result, indent=4) print(s) diff --git a/sysom_server/sysom_diagnosis/service_scripts/jitter_post b/sysom_server/sysom_diagnosis/service_scripts/jitter_post index d6c7a3b69e3fc233f3134d79033c1fa13f375585..153974610a9abf8424b8f93a30ca7e1324f5640b 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/jitter_post +++ b/sysom_server/sysom_diagnosis/service_scripts/jitter_post @@ -153,7 +153,28 @@ def jitter_result(rawdata): "err_msg": "", "result": {} } + + sender_send_stat = (sender_send.max - sender_send.min)/sender_send.max + sender_recv_stat = (sender_recv.max - sender_recv.min)/sender_recv.max + sender_out_stat = (sender_out.max - sender_out.min)/sender_out.max + postprocess_result["result"] = newjitter + status = "normal" + cause = "" + suggestion = "" + + if sender_send_stat > 0.5 or sender_recv_stat > 0.5 or sender_out_stat > 0.5: + status = "error" + cause = "网络发送路径出现抖动" + cur = sender_send_stat + if sender_recv_stat > cur: + cur = sender_recv_stat + cause = "网络接收路径出现抖动" + + if sender_out_stat > cur: + cause = "网络外部链路出现抖动" + + postprocess_result["result"]["summary"] = {"status": status, "cause": cause, "suggestion":suggestion} print(json.dumps(postprocess_result, ensure_ascii=False)) diff --git a/sysom_server/sysom_diagnosis/service_scripts/loadtask_post b/sysom_server/sysom_diagnosis/service_scripts/loadtask_post index 2dc31c2b44114da7ffc4bd1eca0cd9f605457e5c..c5ebdd7428adc0a55365a9e49eeb85d5837011f4 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/loadtask_post +++ b/sysom_server/sysom_diagnosis/service_scripts/loadtask_post @@ -149,27 +149,41 @@ def parse_log(file): hardirq_summary = "NULL" softirq_summary = "NULL" io_summary = "NULL" + new_parse_data["summary"] = {} + new_parse_data["summary"]["suggestion"] = "N/A" if reason["sys"] == "false": sys_summary = " is normal;" else: sys_summary = " is abnormal;" + new_parse_data["summary"]["suggestion"] = "sys utils is high,use nosched tool to deep diagnosis;" if reason["irq"] == "false": hardirq_summary = " is normal;" else: hardirq_summary = " is abnormal;" + new_parse_data["summary"]["suggestion"] = "irq utils is high,use irqoff tool to deep diagnosis;" if reason["softirq"] == "false": softirq_summary = " is normal;" else: softirq_summary = " is abnormal;" + new_parse_data["summary"]["suggestion"] = "softirq utils is high,use softirq tool to deep diagnosis;" if reason["io"] == "false": io_summary = " is normal;" else: io_summary = " is abnormal;" + new_parse_data["summary"]["suggestion"] = "io utils is high,use iosdiag tool to deep diagnosis;" - new_parse_data["summary"] = "Load Influences Result:"+"sys utils" + sys_summary + \ + if int(float(count["uninterrupt tasks"])) >= 5: + new_parse_data["summary"]["suggestion"] = "D task counts is more than 5,please analyse calltrace of tasks by D task pie chart" + + if reason["sys"] == "false" and reason["irq"] == "false" and reason["softirq"] == "false" and reason["io"] == "false": + new_parse_data["summary"]["status"] = "normal" + else: + new_parse_data["summary"]["status"] = "warning" + + new_parse_data["summary"]["cause"] = "Load Influences Result:"+"sys utils" + sys_summary + \ "hardirq" + hardirq_summary + "softirq" + softirq_summary + "io" + io_summary new_parse_data["datataskcount"] = {"data": []} if "uninterrupt tasks" in count.keys() and "runnig tasks" in count.keys(): diff --git a/sysom_server/sysom_diagnosis/service_scripts/memgraph_post b/sysom_server/sysom_diagnosis/service_scripts/memgraph_post index 550eecf3be1381c4c16423fe8a547ca95ead739e..d72960632465d9a68d9ebacc34f73aa10d15132c 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/memgraph_post +++ b/sysom_server/sysom_diagnosis/service_scripts/memgraph_post @@ -30,9 +30,12 @@ def memgraph_result(raw): memgraph["kernel"] = rawdata["memGraph"]["kernelUsed"] newmemgraph = {} - newmemgraph["summary"] = "success" - if "summary" in rawdata: - newmemgraph["summary"] = rawdata["summary"] + newmemgraph["summary"] = {} + newmemgraph["summary"]['status'] = "success" + newmemgraph["summary"]['cause'] = "" + newmemgraph["summary"]['suggestion'] = "" + #if "summary" in rawdata: + # newmemgraph["summary"] = rawdata["summary"] newmemgraph["dataMemEvent"] = {"data": {}} newmemgraph["dataMemEvent"]["data"] = [ {"key": "Util", "value": round(rawdata["event"]["util"], 2)}, @@ -43,6 +46,20 @@ def memgraph_result(raw): {"key": "MemFrag", "value": "NG" if rawdata["event"]["memfrag"] else "OK"} ] + if rawdata["event"]["leak"]: + newmemgraph["summary"]['status'] = "warning" + newmemgraph["summary"]['cause'] = "memory leak;" + newmemgraph["summary"]['suggestion'] += "use sysak memleak to detect;" + if rawdata["event"]["memcg"]: + if newmemgraph["summary"]['status'] == "": + newmemgraph["summary"]['status'] = "warning" + newmemgraph["summary"]['cause'] += "memcg leak;" + newmemgraph["summary"]['suggestion'] += "drop cache to reclaim zombie cgroups;" + if rawdata["event"]["memfrag"]: + if newmemgraph["summary"]['status'] == "": + newmemgraph["summary"]['status'] = "warning" + newmemgraph["summary"]['cause'] += "memory fragment;" + newmemgraph["summary"]['suggestion'] += "compact memory manually" newmemgraph["dataMemOverView"] = {"data": {}} newmemgraph["dataMemOverView"]["data"] = [ diff --git a/sysom_server/sysom_diagnosis/service_scripts/oomcheck_post b/sysom_server/sysom_diagnosis/service_scripts/oomcheck_post index 224c82aed99eb23dc85ef7cead5eab2ad61936a0..f30f33197cce11c3017ba71ca88f98d5bd55ce31 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/oomcheck_post +++ b/sysom_server/sysom_diagnosis/service_scripts/oomcheck_post @@ -147,7 +147,13 @@ def oomcheck_result(raw): {"key": cnt, "Tasks": task_info['task'], "Used": task_info['rss']}) cnt += 1 - result['summary'] = res.get('result', '') + #result['summary'] = res.get('result', '') + result['summary'] = {} + result['summary']['status'] = "" + if result['oomResult']['data'][1]["value"] != '' and result['oomResult']['data'][1]["value"] != '-': + result['summary']['status'] = "warning" + result['summary']['cause'] = result['oomAnalysis']['data'][0]["value"] + result['summary']['suggestion'] = result['oomAnalysis']['data'][1]["value"] print(json.dumps(postprocess_result, indent=4)) diff --git a/sysom_server/sysom_diagnosis/service_scripts/ossre_hook.py b/sysom_server/sysom_diagnosis/service_scripts/ossre_hook.py new file mode 100755 index 0000000000000000000000000000000000000000..35bdba9cae1cf1f0c80fdaedbaa9dc4fdcadf3c6 --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/ossre_hook.py @@ -0,0 +1,177 @@ +""" +Time 2023/12/27 17:32 +Author: chenshiyan +Email chenshiyan@linux.alibaba.com +File ossre_hook.py +Description: +""" +import json,traceback +from apps.task.models import JobModel +from clogger import logger +from .base import HookProcessResult, DiagnosisHookProcessor +from sysom_utils import SysomFramework + +ERR_LEVEL = ["normal","warning","error","critical"] + +def make_release_ignore_option(instance_ip,key_tmp): + ret = {} + try: + ret = {"key":"ossre_release_ignore-%s"%key_tmp,"label":"解除屏蔽","type": "INVOKE_DIAGNOSIS_HOOK","params":{"type":"release_ignore","instance":instance_ip,"key":key_tmp}} + except: + traceback.print_exc() + pass + return ret + +def flush_data(instance_ip,origin_data): + try: + ossre_cache = SysomFramework.gcache("ossre") + total_abnormal_count = 0 + level_num = {"error":0,"warning":0,"critical":0} + for i in range(len(origin_data)): + if True: + abnormal_count = 0 + inspect_result = "无异常" + status = "normal" + if "children" in origin_data[i]: + if len(origin_data[i]["children"]) > 0: + for k in range(len(origin_data[i]["children"])): + abnormal_count_sub1 = 0 + inspect_result_sub1 = "" + status_sub1 = "normal" + all_sub1_ignore = 0 + if ossre_cache.load("%s-%s"%(instance_ip,origin_data[i]["children"][k]["key"])) == 0: + all_sub1_ignore = 1 + release_ignore_ret = make_release_ignore_option(instance_ip,origin_data[i]["children"][k]["key"]) + for x in range(len(origin_data[i]["children"][k]["options"])): + if origin_data[i]["children"][k]["options"][x]["label"] == "屏蔽检测": + del origin_data[i]["children"][k]["options"][x] + break + origin_data[i]["children"][k]["options"].append(release_ignore_ret) + + if "children" in origin_data[i]["children"][k]: + if len(origin_data[i]["children"][k]["children"]) > 0: + for x in range(len(origin_data[i]["children"][k]["children"])): + if ossre_cache.load("%s-%s"%(instance_ip,origin_data[i]["children"][k]["children"][x]["key"])) == 0 or all_sub1_ignore == 1: + origin_data[i]["children"][k]["children"][x]["status"] = "normal" + origin_data[i]["children"][k]["children"][x]["abnormal_count"] = 0 + release_ignore_ret = make_release_ignore_option(instance_ip,origin_data[i]["children"][k]["children"][x]["key"]) + origin_data[i]["children"][k]["children"][x]["options"].append(release_ignore_ret) + for y in range(len(origin_data[i]["children"][k]["children"][x]["options"])): + if origin_data[i]["children"][k]["children"][x]["options"][y]["label"] == "屏蔽检测": + del origin_data[i]["children"][k]["children"][x]["options"][y] + break + if all_sub1_ignore == 1: + for y in range(len(origin_data[i]["children"][k]["children"][x]["options"])): + if origin_data[i]["children"][k]["children"][x]["options"][y]["label"] == "解除屏蔽": + del origin_data[i]["children"][k]["children"][x]["options"][y] + break + else: + if ERR_LEVEL.index(origin_data[i]["children"][k]["children"][x]["status"]) > ERR_LEVEL.index(status_sub1): + status_sub1 = origin_data[i]["children"][k]["children"][x]["status"] + if ERR_LEVEL.index(origin_data[i]["children"][k]["children"][x]["status"]) > ERR_LEVEL.index(status): + status = origin_data[i]["children"][k]["children"][x]["status"] + abnormal_count_sub1 += origin_data[i]["children"][k]["children"][x]["abnormal_count"] + level_num[origin_data[i]["children"][k]["children"][x]["status"]] += 1 + if len(inspect_result_sub1) == 0: + inspect_result_sub1 = origin_data[i]["children"][k]["children"][x]["inspect_result"] + else: + inspect_result_sub1 = "%s\n%s"%(inspect_result_sub1,origin_data[i]["children"][k]["children"][x]["inspect_result"]) + abnormal_count += abnormal_count_sub1 + if abnormal_count > 0: + inspect_result = "存在异常" + origin_data[i]["children"][k]["status"] = status_sub1 + origin_data[i]["children"][k]["abnormal_count"] = abnormal_count_sub1 + origin_data[i]["children"][k]["inspect_result"] = inspect_result_sub1 + origin_data[i]["status"] = status + origin_data[i]["abnormal_count"] = abnormal_count + origin_data[i]["inspect_result"] = inspect_result + total_abnormal_count += abnormal_count + + except: + traceback.print_exc() + pass + return total_abnormal_count,level_num,origin_data + +def flush_checkItems(origin_checkItems,errnum,levelnum): + try: + ori_errnum = 0 + ori_warnnum = 0 + ori_critnum = 0 + ori_nornum = 0 + for i in range(len(origin_checkItems)): + if origin_checkItems[i]["key"] == "告警项": + ori_warnnum = origin_checkItems[i]["value"] + origin_checkItems[i]["value"] = levelnum["warning"] + if origin_checkItems[i]["key"] == "错误项": + ori_errnum = origin_checkItems[i]["value"] + origin_checkItems[i]["value"] = levelnum["error"] + if origin_checkItems[i]["key"] == "严重异常项": + ori_critnum = origin_checkItems[i]["value"] + origin_checkItems[i]["value"] = levelnum["critical"] + + for i in range(len(origin_checkItems)): + if origin_checkItems[i]["key"] == "正常项": + ori_nornum = origin_checkItems[i]["value"] + origin_checkItems[i]["value"] = ori_nornum + ori_errnum + ori_warnnum + ori_critnum - levelnum["warning"] - levelnum["error"] - levelnum["critical"] + except: + traceback.print_exc() + pass + return ori_nornum + ori_errnum + ori_warnnum + ori_critnum, origin_checkItems + +def flush_hostInfo(origin_hostInfo,errnum,levelnum,total_count): + try: + line_list = origin_hostInfo.replace("\n\n","\n").splitlines() + del line_list[-1] + line_list.append("共检测%s项,告警项%s个,错误项%s个,严重异常项%s个"%(total_count,levelnum["warning"],levelnum["error"],levelnum["critical"])) + origin_hostInfo = "\n\n".join(line_list) + except: + traceback.print_exc() + pass + return origin_hostInfo + +class HookProcessor(DiagnosisHookProcessor): + async def invoke_hook(self, instance: JobModel, params: dict) -> HookProcessResult: + ossre_cache = SysomFramework.gcache("ossre") + + op_type = params.get("type", "") + ossre_key = params.get("key", "") + instance_ip = params.get("instance", "") + if op_type == "ignore": + ossre_cache.store("%s-%s"%(instance_ip,ossre_key),0) + if op_type == "release_ignore": + ossre_cache.store("%s-%s"%(instance_ip,ossre_key),1) + + origin_result = json.loads(instance.result) + try: + web_data = origin_result["OssreResult"]["data"] + web_checkItems = origin_result["checkItems"]["data"] + web_hostInfo = origin_result["hostInfo"]["data"] + OssreResult_cache = ossre_cache.load("%s-OssreResult"%instance_ip) + if "data" not in OssreResult_cache: + origin_data = web_data + else: + origin_data = OssreResult_cache["data"] + if "checkItems" not in OssreResult_cache: + origin_checkItems = web_checkItems + else: + origin_checkItems = OssreResult_cache["checkItems"] + if "hostInfo" not in OssreResult_cache: + origin_hostInfo = web_hostInfo + else: + origin_hostInfo = OssreResult_cache["hostInfo"] + + total_abnormal_count,level_num,new_data = flush_data(instance_ip,origin_data) + total_count,new_checkItems = flush_checkItems(origin_checkItems,total_abnormal_count,level_num) + new_hostInfo = flush_hostInfo(origin_hostInfo,total_abnormal_count,level_num,total_count) + + origin_result["checkItems"]["data"] = new_checkItems + origin_result["OssreResult"]["data"] = new_data + origin_result["hostInfo"]["data"] = new_hostInfo + instance.result = json.dumps(origin_result) + await self.save_job(instance) + + except: + traceback.print_exc() + pass + + return HookProcessResult(code=200, data={}, err_msg="") diff --git a/sysom_server/sysom_diagnosis/service_scripts/ossre_post.py b/sysom_server/sysom_diagnosis/service_scripts/ossre_post.py new file mode 100755 index 0000000000000000000000000000000000000000..b84b8ba3ae54e8d257d4e30d640619134f56349c --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/ossre_post.py @@ -0,0 +1,648 @@ +""" +Time 2023/12/19 +Author: chenshiyan +File ossre_post.py +Description: +""" +from typing import List +from .base import DiagnosisJobResult, DiagnosisPostProcessor, PostProcessResult +import json +import time +import traceback +from clogger import logger +from sysom_utils import SysomFramework + +ERR_LEVEL = ["normal","warning","error","critical"] +ERR_LEVEL_CN = ["正常项","告警项","错误项","致命项"] + +TOOLS_MSG = { + "memgraph":{"key":"sysom_diagnosis:memgraph","url":"/diagnose/memory/memgraph?","cn_name":"内存大盘","params":{"instance":""}}, + "oomcheck":{"key":"sysom_diagnosis:oomcheck","url":"/diagnose/memory/oomcheck?","cn_name":"OOM诊断","params":{"instance":""}}, + "podmem":{"key":"sysom_diagnosis:podmem","url":"/diagnose/memory/podmem?","cn_name":"内存异常诊断","params":{"instance":""}}, + "loadtask":{"key":"sysom_diagnosis:laodtask","url":"/diagnose/cpu/podmem?","cn_name":"系统负载诊断","params":{"instance":""}} +} + +def make_tool_option(tool,instance): + ret = {} + try: + if tool not in TOOLS_MSG: + return ret + tool_pms = "" + for x in TOOLS_MSG[tool]["params"]: + if len(tool_pms) > 0: + tool_pms = "%s&"%tool_pms + if x == "instance": + tool_pms = "%s%s=%s"%(tool_pms,x,instance) + else: + tool_pms = "%s%s=%s"%(tool_pms,x,TOOLS_MSG[tool]["params"][x]) + + ret = {"key":TOOLS_MSG[tool]["key"],"label":TOOLS_MSG[tool]["cn_name"],"type": "LINK","url":"%s%s"%(TOOLS_MSG[tool]["url"],tool_pms)} + except Exception as e: + logger.exception(e) + pass + return ret + +def make_ignore_option(instance_ip,key_tmp): + ret = {} + try: + ret = {"key":"ossre_ignore-%s"%key_tmp,"label":"屏蔽检测","type": "INVOKE_DIAGNOSIS_HOOK","params":{"type":"ignore","instance":instance_ip,"key":key_tmp}} + except Exception as e: + logger.exception(e) + pass + return ret + +def make_release_ignore_option(instance_ip,key_tmp): + ret = {} + try: + ret = {"key":"ossre_release_ignore-%s"%key_tmp,"label":"解除屏蔽","type": "INVOKE_DIAGNOSIS_HOOK","params":{"type":"release_ignore","instance":instance_ip,"key":key_tmp}} + except Exception as e: + logger.exception(e) + traceback.print_exc() + pass + return ret + +def flush_data(instance_ip,origin_data): + try: + ossre_cache = SysomFramework.gcache("ossre") + total_abnormal_count = 0 + level_num = {"error":0,"warning":0,"critical":0} + for i in range(len(origin_data)): + if True: + abnormal_count = 0 + inspect_result = "无异常" + status = "normal" + if "children" in origin_data[i]: + if len(origin_data[i]["children"]) > 0: + for k in range(len(origin_data[i]["children"])): + abnormal_count_sub1 = 0 + inspect_result_sub1 = "" + status_sub1 = "normal" + all_sub1_ignore = 0 + if ossre_cache.load("%s-%s"%(instance_ip,origin_data[i]["children"][k]["key"])) == 0: + all_sub1_ignore = 1 + release_ignore_ret = make_release_ignore_option(instance_ip,origin_data[i]["children"][k]["key"]) + for x in range(len(origin_data[i]["children"][k]["options"])): + if origin_data[i]["children"][k]["options"][x]["label"] == "屏蔽检测": + del origin_data[i]["children"][k]["options"][x] + break + origin_data[i]["children"][k]["options"].append(release_ignore_ret) + + if "children" in origin_data[i]["children"][k]: + if len(origin_data[i]["children"][k]["children"]) > 0: + for x in range(len(origin_data[i]["children"][k]["children"])): + if ossre_cache.load("%s-%s"%(instance_ip,origin_data[i]["children"][k]["children"][x]["key"])) == 0 or all_sub1_ignore == 1: + origin_data[i]["children"][k]["children"][x]["status"] = "normal" + origin_data[i]["children"][k]["children"][x]["abnormal_count"] = 0 + release_ignore_ret = make_release_ignore_option(instance_ip,origin_data[i]["children"][k]["children"][x]["key"]) + origin_data[i]["children"][k]["children"][x]["options"].append(release_ignore_ret) + for y in range(len(origin_data[i]["children"][k]["children"][x]["options"])): + if origin_data[i]["children"][k]["children"][x]["options"][y]["label"] == "屏蔽检测": + del origin_data[i]["children"][k]["children"][x]["options"][y] + break + if all_sub1_ignore == 1: + for y in range(len(origin_data[i]["children"][k]["children"][x]["options"])): + if origin_data[i]["children"][k]["children"][x]["options"][y]["label"] == "解除屏蔽": + del origin_data[i]["children"][k]["children"][x]["options"][y] + break + else: + if ERR_LEVEL.index(origin_data[i]["children"][k]["children"][x]["status"]) > ERR_LEVEL.index(status_sub1): + status_sub1 = origin_data[i]["children"][k]["children"][x]["status"] + if ERR_LEVEL.index(origin_data[i]["children"][k]["children"][x]["status"]) > ERR_LEVEL.index(status): + status = origin_data[i]["children"][k]["children"][x]["status"] + abnormal_count_sub1 += origin_data[i]["children"][k]["children"][x]["abnormal_count"] + level_num[origin_data[i]["children"][k]["children"][x]["status"]] += 1 + if len(inspect_result_sub1) == 0: + inspect_result_sub1 = origin_data[i]["children"][k]["children"][x]["inspect_result"] + else: + inspect_result_sub1 = "%s\n%s"%(inspect_result_sub1,origin_data[i]["children"][k]["children"][x]["inspect_result"]) + abnormal_count += abnormal_count_sub1 + if abnormal_count > 0: + inspect_result = "存在异常" + origin_data[i]["children"][k]["status"] = status_sub1 + origin_data[i]["children"][k]["abnormal_count"] = abnormal_count_sub1 + origin_data[i]["children"][k]["inspect_result"] = inspect_result_sub1 + origin_data[i]["status"] = status + origin_data[i]["abnormal_count"] = abnormal_count + origin_data[i]["inspect_result"] = inspect_result + total_abnormal_count += abnormal_count + + except Exception as e: + logger.exception(e) + pass + return total_abnormal_count,level_num,origin_data + +def flush_checkItems(origin_checkItems,errnum,levelnum): + try: + ori_errnum = 0 + ori_warnnum = 0 + ori_critnum = 0 + ori_nornum = 0 + for i in range(len(origin_checkItems)): + if origin_checkItems[i]["key"] == "告警项": + ori_warnnum = origin_checkItems[i]["value"] + origin_checkItems[i]["value"] = levelnum["warning"] + if origin_checkItems[i]["key"] == "错误项": + ori_errnum = origin_checkItems[i]["value"] + origin_checkItems[i]["value"] = levelnum["error"] + if origin_checkItems[i]["key"] == "严重异常项": + ori_critnum = origin_checkItems[i]["value"] + origin_checkItems[i]["value"] = levelnum["critical"] + + for i in range(len(origin_checkItems)): + if origin_checkItems[i]["key"] == "正常项": + ori_nornum = origin_checkItems[i]["value"] + origin_checkItems[i]["value"] = ori_nornum + ori_errnum + ori_warnnum + ori_critnum - levelnum["warning"] - levelnum["error"] - levelnum["critical"] + except Exception as e: + logger.exception(e) + pass + return ori_nornum + ori_errnum + ori_warnnum + ori_critnum, origin_checkItems + +def flush_hostInfo(origin_hostInfo,errnum,levelnum,total_count): + try: + line_list = origin_hostInfo.replace("\n\n","\n").splitlines() + del line_list[-1] + #line_list.append("共检测%s项,疑似有%s项风险"%(total_count,errnum)) + line_list.append("共检测%s项,告警项%s个,错误项%s个,严重异常项%s个"%(total_count,levelnum["warning"],levelnum["error"],levelnum["critical"])) + origin_hostInfo = "\n\n".join(line_list) + except Exception as e: + logger.exception(e) + pass + return origin_hostInfo + +def sysak_to_sysom_info(sysakdict): + retdict = {} + retdict["success"] = True + retdict["num_total"] = 0 + retdict["num_error"] = 0 + retdict["level_num"] = {"error":0,"warning":0,"critical":0} + retdict["critical_level_num"] = 0 + retdict["indicator"] = {"sched":{},"mem":{},"net":{},"io":{},"misc":{},"errnum":0,"status":"normal","tool":[]} + retdict["config"] = {"sched":{},"mem":{},"net":{},"io":{},"misc":{},"errnum":0,"status":"normal","tool":[]} + retdict["issue"] = {"critical":{},"error":{},"warning":{},"errnum":0,"status":"normal","tool":[]} + retdict["log"] = {"critical":{},"error":{},"warning":{},"errnum":0,"status":"normal","tool":[]} + retdict["hw"] = {"critical":{},"error":{},"warning":{},"errnum":0,"status":"normal","tool":[]} + retdict["sysinfo"] = {"kernel_version":"","cpu_info":"","mem_info":""} + try: + if sysakdict["success"] != "true": + retdict["success"] = False + else: + retdict["sysinfo"]["kernel_version"] = sysakdict["fields"]["SYSINFO"]["kernel_version"] + retdict["sysinfo"]["cpu_info"] = sysakdict["fields"]["SYSINFO"]["cpuinfo"] + retdict["sysinfo"]["mem_info"] = sysakdict["fields"]["SYSINFO"]["meminfo"] + for i in sysakdict["items"]["config"]: + for j in sysakdict["items"]["config"][i]: + retdict["num_total"] += 1 + if sysakdict["items"]["config"][i][j]["status"] == "warning": + retdict["num_error"] += 1 + level_tmp = "warning" + if sysakdict["items"]["config"][i][j]["level"] in ERR_LEVEL: + level_tmp = sysakdict["items"]["config"][i][j]["level"] + retdict["level_num"][level_tmp] += 1 + if len(retdict["config"][i]) == 0: + retdict["config"][i] = {"errnum":0,"status":"normal","tool":[],"summary":"","sub_items":{}} + if sysakdict["items"]["config"][i][j]["level"] in ERR_LEVEL: + if ERR_LEVEL.index(sysakdict["items"]["config"][i][j]["level"]) > ERR_LEVEL.index(retdict["config"][i]["status"]): + retdict["config"][i]["status"] = sysakdict["items"]["config"][i][j]["level"] + if ERR_LEVEL.index(sysakdict["items"]["config"][i][j]["level"]) > ERR_LEVEL.index(retdict["config"]["status"]): + retdict["config"]["status"] = sysakdict["items"]["config"][i][j]["level"] + retdict["config"][i]["errnum"] += 1 + retdict["config"]["errnum"] += 1 + if len(retdict["config"][i]["summary"]) > 0: + retdict["config"][i]["summary"] = "%s\n%s"%(retdict["config"][i]["summary"],sysakdict["items"]["config"][i][j]["suggestion"]) + else: + retdict["config"][i]["summary"] = sysakdict["items"]["config"][i][j]["suggestion"] + retdict["config"][i]["sub_items"][j] = {"status":"normal","summary":"-","tool":""} + retdict["config"][i]["sub_items"][j]["status"] = sysakdict["items"]["config"][i][j]["level"] + retdict["config"][i]["sub_items"][j]["summary"] = sysakdict["items"]["config"][i][j]["suggestion"] + if len(sysakdict["items"]["config"][i][j]["tool"]) > 0: + retdict["config"][i]["sub_items"][j]["tool"] = sysakdict["items"]["config"][i][j]["tool"] + if sysakdict["items"]["config"][i][j]["tool"] not in retdict["config"][i]["tool"]: + retdict["config"][i]["tool"].append(sysakdict["items"]["config"][i][j]["tool"]) + if sysakdict["items"]["config"][i][j]["tool"] not in retdict["config"]["tool"]: + retdict["config"]["tool"].append(sysakdict["items"]["config"][i][j]["tool"]) + + for i in sysakdict["items"]["indicator"]: + for j in sysakdict["items"]["indicator"][i]: + retdict["num_total"] += 1 + if sysakdict["items"]["indicator"][i][j]["status"] == "warning": + retdict["num_error"] += 1 + + level_tmp = "warning" + if sysakdict["items"]["indicator"][i][j]["level"] in ERR_LEVEL: + level_tmp = sysakdict["items"]["indicator"][i][j]["level"] + retdict["level_num"][level_tmp] += 1 + + if len(retdict["indicator"][i]) == 0: + retdict["indicator"][i] = {"errnum":0,"status":"normal","tool":[],"summary":"","sub_items":{}} + if sysakdict["items"]["indicator"][i][j]["level"] in ERR_LEVEL: + if ERR_LEVEL.index(sysakdict["items"]["indicator"][i][j]["level"]) > ERR_LEVEL.index(retdict["indicator"][i]["status"]): + retdict["indicator"][i]["status"] = sysakdict["items"]["indicator"][i][j]["level"] + if ERR_LEVEL.index(sysakdict["items"]["indicator"][i][j]["level"]) > ERR_LEVEL.index(retdict["indicator"]["status"]): + retdict["indicator"]["status"] = sysakdict["items"]["indicator"][i][j]["level"] + retdict["indicator"][i]["errnum"] += 1 + retdict["indicator"]["errnum"] += 1 + + retdict["indicator"][i]["sub_items"][j] = {"status":"normal","summary":"-","tool":""} + retdict["indicator"][i]["sub_items"][j]["status"] = sysakdict["items"]["indicator"][i][j]["level"] + retdict["indicator"][i]["sub_items"][j]["summary"] = sysakdict["items"]["indicator"][i][j]["suggestion"] + + if len(sysakdict["items"]["indicator"][i][j]["summary"]) > 0: + if len(retdict["indicator"][i]["summary"]) > 0: + retdict["indicator"][i]["summary"] = "%s\n%s"%(retdict["indicator"][i]["summary"],sysakdict["items"]["indicator"][i][j]["summary"]) + else: + retdict["indicator"][i]["summary"] = sysakdict["items"]["indicator"][i][j]["summary"] + retdict["indicator"][i]["sub_items"][j]["summary"] = sysakdict["items"]["indicator"][i][j]["summary"] + else: + if len(retdict["indicator"][i]["summary"]) > 0: + retdict["indicator"][i]["summary"] = "%s\n%s"%(retdict["indicator"][i]["summary"],sysakdict["items"]["indicator"][i][j]["suggestion"]) + else: + retdict["indicator"][i]["summary"] = sysakdict["items"]["indicator"][i][j]["suggestion"] + retdict["indicator"][i]["sub_items"][j]["summary"] = sysakdict["items"]["indicator"][i][j]["suggestion"] + + if len(sysakdict["items"]["indicator"][i][j]["tool"]) > 0: + retdict["indicator"][i]["sub_items"][j]["tool"] = sysakdict["items"]["indicator"][i][j]["tool"] + if sysakdict["items"]["indicator"][i][j]["tool"] not in retdict["indicator"][i]["tool"]: + retdict["indicator"][i]["tool"].append(sysakdict["items"]["indicator"][i][j]["tool"]) + if sysakdict["items"]["indicator"][i][j]["tool"] not in retdict["indicator"]["tool"]: + retdict["indicator"]["tool"].append(sysakdict["items"]["indicator"][i][j]["tool"]) + + retdict["num_total"] += len(sysakdict["items"]["hw"]) + for i in sysakdict["items"]["hw"]: + level = sysakdict["items"]["hw"][i]["level"] + if sysakdict["items"]["hw"][i]["status"] == "warning": + retdict["num_error"] += 1 + if level not in ERR_LEVEL: + level = "warning" + retdict["level_num"][level] += 1 + + if len(retdict["hw"][level]) == 0: + retdict["hw"][level] = {"errnum":0,"summary":"","sub_items":{}} + retdict["hw"][level]["sub_items"][i] = sysakdict["items"]["hw"][i] + retdict["hw"][level]["sub_items"][i]["status"] = level + retdict["hw"][level]["errnum"] += 1 + if len(retdict["hw"][level]["summary"]) > 0: + retdict["hw"][level]["summary"] = "%s\n%s"%(retdict["hw"][level]["summary"], sysakdict["items"]["hw"][i]["summary"]) + else: + retdict["hw"][level]["summary"] = sysakdict["items"]["hw"][i]["summary"] + + retdict["num_total"] += len(sysakdict["items"]["log"]) + for i in sysakdict["items"]["log"]: + level = sysakdict["items"]["log"][i]["level"] + if sysakdict["items"]["log"][i]["status"] == "warning": + retdict["num_error"] += 1 + if level not in ERR_LEVEL: + level = "warning" + retdict["level_num"][level] += 1 + + if len(retdict["log"][level]) == 0: + retdict["log"][level] = {"errnum":0,"summary":"","sub_items":{}} + retdict["log"][level]["sub_items"][i] = sysakdict["items"]["log"][i] + retdict["log"][level]["sub_items"][i]["status"] = level + retdict["log"][level]["errnum"] += 1 + if len(retdict["log"][level]["summary"]) > 0: + retdict["log"][level]["summary"] = "%s\n%s"%(retdict["log"][level]["summary"], sysakdict["items"]["log"][i]["summary"]) + else: + retdict["log"][level]["summary"] = sysakdict["items"]["log"][i]["summary"] + + retdict["num_total"] += len(sysakdict["items"]["issue"]) + for i in sysakdict["items"]["issue"]: + level = sysakdict["items"]["issue"][i]["level"] + if sysakdict["items"]["issue"][i]["return"] == True: + retdict["num_error"] += 1 + if level not in ERR_LEVEL: + level = "warning" + retdict["level_num"][level] += 1 + + if len(retdict["issue"][level]) == 0: + retdict["issue"][level] = {"errnum":0,"summary":"","sub_items":{}} + retdict["issue"][level]["sub_items"][i] = {} + retdict["issue"][level]["sub_items"][i]["summary"] = sysakdict["items"]["issue"][i]["solution"] + retdict["issue"][level]["sub_items"][i]["status"] = level + + retdict["issue"][level]["errnum"] += 1 + if len(retdict["issue"][level]["summary"]) > 0: + retdict["issue"][level]["summary"] = "%s\n%s"%(retdict["issue"][level]["summary"], sysakdict["items"]["issue"][i]["solution"]) + else: + retdict["issue"][level]["summary"] = sysakdict["items"]["issue"][i]["solution"] + + except Exception as e: + logger.exception(e) + retdict["success"] = False + pass + return retdict + +class PostProcessor(DiagnosisPostProcessor): + def parse_diagnosis_result(self, results: List[DiagnosisJobResult]) -> PostProcessResult: + postprocess_result = PostProcessResult( + code=1, + err_msg="", + result={} + ) + + ossre_ret = results[0].stdout + instance_ip = results[0].job.instance + logger.info(ossre_ret) + datas = [] + piedatas = [] + mddata = "" + summary_final = "" + status_final = "normal" + suggestion_final = "" + + try: + results = json.loads(ossre_ret) + retdict = sysak_to_sysom_info(results["ossre"]) + num = 0 + if retdict["success"] == True: + config = {"key":"config","inspect_items":"配置检查","status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + config["status"] = retdict["config"]["status"] + config["abnormal_count"] = retdict["config"]["errnum"] + config["inspect_result"] = "无异常" + if config["abnormal_count"] > 0: + config["inspect_result"] = "存在异常" + + config["options"] = "" + num += 1 + for i in retdict["config"]: + if i != "io" and i != "mem" and i != "net" and i != "misc" and i != "sched": + continue + + key_tmp = "config-%s"%i + tmpd = {"key":key_tmp,"inspect_items":i,"status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + + if len(retdict["config"][i]) == 0: + config["children"].append(tmpd) + num += 1 + continue + + if retdict["config"][i]["errnum"] > 0: + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + tmpd["options"].append(ignore_ret) + + for j in retdict["config"][i]["sub_items"]: + options_sub = [] + key_tmp = "config-%s-%s"%(i,j) + if retdict["config"][i]["sub_items"][j]["status"] != "normal": + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + options_sub.append(ignore_ret) + + if len(retdict["config"][i]["sub_items"][j]["tool"]) > 0: + tp_tmp = make_tool_option(retdict["config"][i]["sub_items"][j]["tool"],instance_ip) + if len(tp_tmp) > 0: + options_sub.append(tp_tmp) + tmpd["children"].append({"key":key_tmp,"inspect_items":j,"status":retdict["config"][i]["sub_items"][j]["status"],"abnormal_count":1,"inspect_result":retdict["config"][i]["sub_items"][j]["summary"],"options":options_sub}) + + tmpd["status"] = retdict["config"][i]["status"] + tmpd["abnormal_count"] = retdict["config"][i]["errnum"] + tmpd["inspect_result"] = retdict["config"][i]["summary"] + for t in retdict["config"][i]["tool"]: + tp_tmp = make_tool_option(t,instance_ip) + if len(tp_tmp) > 0: + tmpd["options"].append(tp_tmp) + config["children"].append(tmpd) + num += 1 + datas.append(config) + + indicator = {"key":"indicator","inspect_items":"指标检查","status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + indicator["status"] = retdict["indicator"]["status"] + indicator["abnormal_count"] = retdict["indicator"]["errnum"] + indicator["inspect_result"] = "无异常" + if indicator["abnormal_count"] > 0: + indicator["inspect_result"] = "存在异常" + indicator["options"] = "" + num += 1 + + for i in retdict["indicator"]: + if i != "io" and i != "mem" and i != "net" and i != "misc" and i != "sched": + continue + key_tmp = "indicator-%s"%i + tmpd = {"key":key_tmp,"inspect_items":i,"status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + if len(retdict["indicator"][i]) == 0: + indicator["children"].append(tmpd) + num += 1 + continue + + if retdict["indicator"][i]["errnum"] > 0: + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + tmpd["options"].append(ignore_ret) + + for j in retdict["indicator"][i]["sub_items"]: + options_sub = [] + key_tmp = "indicator-%s-%s"%(i,j) + if retdict["indicator"][i]["sub_items"][j]["status"] != "normal": + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + options_sub.append(ignore_ret) + + if len(retdict["indicator"][i]["sub_items"][j]["tool"]) > 0: + tp_tmp = make_tool_option(retdict["indicator"][i]["sub_items"][j]["tool"],instance_ip) + if len(tp_tmp) > 0: + options_sub.append(tp_tmp) + tmpd["children"].append({"key":"indicator-%s-%s"%(i,j),"inspect_items":j,"status":retdict["indicator"][i]["sub_items"][j]["status"],"abnormal_count":1,"inspect_result":retdict["indicator"][i]["sub_items"][j]["summary"],"options":options_sub}) + + tmpd["status"] = retdict["indicator"][i]["status"] + tmpd["abnormal_count"] = retdict["indicator"][i]["errnum"] + tmpd["inspect_result"] = retdict["indicator"][i]["summary"] + tools = "" + for t in retdict["indicator"][i]["tool"]: + tp_tmp = make_tool_option(t,instance_ip) + if len(tp_tmp) > 0: + tmpd["options"].append(tp_tmp) + indicator["children"].append(tmpd) + num += 1 + datas.append(indicator) + + issue = {"key":"issue","inspect_items":"已知问题检查","status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + issue_num = 0 + if len(retdict["issue"]["warning"]) > 0: + issue["status"] = "warning" + issue_num += retdict["issue"]["warning"]["errnum"] + if len(retdict["issue"]["error"]) > 0: + issue["status"] = "error" + issue_num += retdict["issue"]["error"]["errnum"] + if len(retdict["issue"]["critical"]) > 0: + issue["status"] = "critical" + issue_num += retdict["issue"]["critical"]["errnum"] + + issue["abnormal_count"] = issue_num + issue["inspect_result"] = "无异常" + if issue["abnormal_count"] > 0: + issue["inspect_result"] = "存在异常" + issue["options"] = [] + num += 1 + + for i in retdict["issue"]: + if i != "critical" and i != "error" and i != "warning": + continue + key_tmp = "issue-%s"%i + tmpd = {"key":key_tmp,"inspect_items":ERR_LEVEL_CN[ERR_LEVEL.index(i)],"status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + if len(retdict["issue"][i]) > 0: + if retdict["issue"][i]["errnum"] > 0: + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + tmpd["options"].append(ignore_ret) + + for j in retdict["issue"][i]["sub_items"]: + options_sub = [] + key_tmp = "issue-%s-%s"%(i,j) + if retdict["issue"][i]["sub_items"][j]["status"] != "normal": + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + options_sub.append(ignore_ret) + tmpd["children"].append({"key":"issue-%s-%s"%(i,j),"inspect_items":j,"status":retdict["issue"][i]["sub_items"][j]["status"],"abnormal_count":1,"inspect_result":retdict["issue"][i]["sub_items"][j]["summary"],"options":options_sub}) + tmpd["status"] = i + tmpd["abnormal_count"] = retdict["issue"][i]["errnum"] + tmpd["inspect_result"] = retdict["issue"][i]["summary"] + issue["children"].append(tmpd) + num += 1 + datas.append(issue) + + log = {"key":"log","inspect_items":"日志检查","status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + log_num = 0 + if len(retdict["log"]["warning"]) > 0: + log["status"] = "warning" + log_num += retdict["log"]["warning"]["errnum"] + if len(retdict["log"]["error"]) > 0: + log["status"] = "error" + log_num += retdict["log"]["error"]["errnum"] + if len(retdict["log"]["critical"]) > 0: + log["status"] = "critical" + log_num += retdict["log"]["critical"]["errnum"] + + log["abnormal_count"] = log_num + log["inspect_result"] = "无异常" + if log["abnormal_count"] > 0: + log["inspect_result"] = "存在异常" + log["options"] = [] + num += 1 + + for i in retdict["log"]: + if i != "critical" and i != "error" and i != "warning": + continue + key_tmp = "log-%s"%i + tmpd = {"key":key_tmp,"inspect_items":ERR_LEVEL_CN[ERR_LEVEL.index(i)],"status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + if len(retdict["log"][i]) > 0: + if retdict["log"][i]["errnum"] > 0: + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + tmpd["options"].append(ignore_ret) + + for j in retdict["log"][i]["sub_items"]: + options_sub = [] + key_tmp = "log-%s-%s"%(i,j) + if retdict["log"][i]["sub_items"][j]["status"] != "normal": + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + options_sub.append(ignore_ret) + + if len(retdict["log"][i]["sub_items"][j]["tool"]) > 0: + tp_tmp = make_tool_option(retdict["log"][i]["sub_items"][j]["tool"],instance_ip) + if len(tp_tmp) > 0: + options_sub.append(tp_tmp) + tmpd["children"].append({"key":"log-%s-%s"%(i,j),"inspect_items":j,"status":retdict["log"][i]["sub_items"][j]["status"],"abnormal_count":1,"inspect_result":retdict["log"][i]["sub_items"][j]["summary"],"options":options_sub}) + tmpd["status"] = i + tmpd["abnormal_count"] = retdict["log"][i]["errnum"] + tmpd["inspect_result"] = retdict["log"][i]["summary"] + + log["children"].append(tmpd) + num += 1 + datas.append(log) + + hw = {"key":"hw","inspect_items":"硬件检查","status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + hw_num = 0 + if len(retdict["hw"]["warning"]) > 0: + hw["status"] = "warning" + hw_num += retdict["hw"]["warning"]["errnum"] + if len(retdict["hw"]["error"]) > 0: + hw["status"] = "error" + hw_num += retdict["hw"]["error"]["errnum"] + if len(retdict["hw"]["critical"]) > 0: + hw["status"] = "critical" + hw_num += retdict["hw"]["critical"]["errnum"] + + hw["abnormal_count"] = hw_num + hw["inspect_result"] = "无异常" + if hw["abnormal_count"] > 0: + hw["inspect_result"] = "存在异常" + hw["options"] = [] + num += 1 + + for i in retdict["hw"]: + if i != "critical" and i != "error" and i != "warning": + continue + key_tmp = "hw-%s"%i + tmpd = {"key":key_tmp,"inspect_items":ERR_LEVEL_CN[ERR_LEVEL.index(i)],"status":"normal","abnormal_count":0,"inspect_result":"","options":[],"children":[]} + + if len(retdict["hw"][i]) > 0: + if retdict["hw"][i]["errnum"] > 0: + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + tmpd["options"].append(ignore_ret) + + for j in retdict["hw"][i]["sub_items"]: + options_sub = [] + key_tmp = "hw-%s-%s"%(i,j) + if retdict["hw"][i]["sub_items"][j]["status"] != "normal": + ignore_ret = make_ignore_option(instance_ip,key_tmp) + if len(ignore_ret) > 0: + options_sub.append(ignore_ret) + + tmpd["children"].append({"key":"hw-%s-%s"%(i,j),"inspect_items":j,"status":retdict["hw"][i]["sub_items"][j]["status"],"abnormal_count":1,"inspect_result":retdict["hw"][i]["sub_items"][j]["summary"],"options":options_sub}) + + tmpd["status"] = i + tmpd["abnormal_count"] = retdict["hw"][i]["errnum"] + tmpd["inspect_result"] = retdict["hw"][i]["summary"] + hw["children"].append(tmpd) + num += 1 + datas.append(hw) + piedatas.append({"key":"告警项","value":retdict["level_num"]["warning"]}) + piedatas.append({"key":"错误项","value":retdict["level_num"]["error"]}) + piedatas.append({"key":"严重异常项","value":retdict["level_num"]["critical"]}) + piedatas.append({"key":"正常项","value":retdict["num_total"]-retdict["num_error"]}) + mddata = "%s### 内核版本\n\n%s\n\n"%(mddata,retdict["sysinfo"]["kernel_version"]) + mddata = "%s### CPU信息\n\nNuma Node:%s      CPU数量:%s      CPU Model:%s\n\n"%(mddata,retdict["sysinfo"]["cpu_info"]["numa"],retdict["sysinfo"]["cpu_info"]["cpunum"],retdict["sysinfo"]["cpu_info"]["model name"]) + mddata = "%s### 检测结果\n\n共检测%s项,告警项%s个,错误项%s个,严重异常项%s个\n\n"%(mddata,retdict["num_total"],retdict["level_num"]["warning"],retdict["level_num"]["error"],retdict["level_num"]["critical"]) + else: + datas.append({"key":0,"ossre":"Fail","result":"sysak_to_sysom_info fail!"}) + + OssreResult = {"data": datas,"checkItems":piedatas,"hostInfo":mddata} + ossre_cache = SysomFramework.gcache("ossre") + ossre_cache.store("%s-OssreResult"%instance_ip,OssreResult) + + total_abnormal_count,level_num,new_data = flush_data(instance_ip,datas) + total_count,new_checkItems = flush_checkItems(piedatas,total_abnormal_count,level_num) + new_hostInfo = flush_hostInfo(mddata,total_abnormal_count,level_num,total_count) + + summary_final = "共检测%s项,告警项%s个,错误项%s个,严重异常项%s个"%(total_count,level_num["warning"],level_num["error"],level_num["critical"]) + if level_num["warning"] > 0 : + status_final = "warning" + if level_num["error"] > 0 : + status_final = "error" + if level_num["critical"] > 0 : + status_final = "critical" + if status_final != "normal": + suggestion_final = "查看具体检查列表异常项,根据提示对异常进行处理或进一步诊断。" + + postprocess_result.code = 0 + + except Exception as e: + logger.exception(e) + postprocess_result.err_msg = e + pass + try: + postprocess_result.result = { + "OssreResult": {"data": new_data}, + "checkItems":{"data":new_checkItems}, + "hostInfo":{"data":new_hostInfo}, + "summary":{"status": status_final,"cause":summary_final,"suggestion":suggestion_final} + } + except Exception as e: + logger.exception(e) + postprocess_result.err_msg = e + pass + logger.info(postprocess_result.result) + return postprocess_result + diff --git a/sysom_server/sysom_diagnosis/service_scripts/ossre_pre.py b/sysom_server/sysom_diagnosis/service_scripts/ossre_pre.py new file mode 100644 index 0000000000000000000000000000000000000000..865501afa9bd12372004f7bc5a1cffa2fc4e4a1d --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/ossre_pre.py @@ -0,0 +1,24 @@ +""" +Time 2023/012/19 17:32 +Author: chenshiyan +Email chenshiyan@linux.alibaba.com +File ossre_pre.py +Description: +""" +from .base import DiagnosisJob, DiagnosisPreProcessor, DiagnosisTask + + +class PreProcessor(DiagnosisPreProcessor): + """ossre diagnosis + """ + + def get_diagnosis_cmds(self, params: dict) -> DiagnosisTask: + instance = params.get("instance", "") + command = "sysak ossre_client -s > /dev/null " + " && " + "cat /var/log/sysak/ossre.log" + print (command) + return DiagnosisTask( + jobs=[ + DiagnosisJob(instance=instance, cmd=command) + ], + in_order=False, + ) diff --git a/sysom_server/sysom_diagnosis/service_scripts/packetdrop_post b/sysom_server/sysom_diagnosis/service_scripts/packetdrop_post index ec91a51ea7df52a3f5f48c19dacd12df76446846..ed976a9edc23dddc83cdac290c533efb29169d5e 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/packetdrop_post +++ b/sysom_server/sysom_diagnosis/service_scripts/packetdrop_post @@ -39,7 +39,6 @@ HARDWARE_DROP = 0 DROP_REASONS = {} - def delta_map(pre, now): delta = {} for k, v in pre.items(): @@ -114,7 +113,7 @@ def packetdrop_result(res): "result": {}, # "rawresult": res, } - + if len(res) < 6: postprocess_result["code"] = 1 postprocess_result["err_msg"] = "Failed to parse output" @@ -152,6 +151,31 @@ def packetdrop_result(res): ] postprocess_result['result'] = packetdrop + status = "normal" + cause = "" + suggestion = "" + + if TCP_DROP != 0: + status = "error" + cause = "出现了tcp丢包" + suggestion = "" + + if UDP_DROP != 0: + status = "error" + cause = "出现了udp丢包" + suggestion = "" + + if ICMP_DROP != 0: + status = "error" + cause = "出现了icmp丢包" + suggestion = "" + + if HARDWARE_DROP != 0: + status = "error" + cause = "出现了硬件丢包" + suggestion = "" + + postprocess_result["result"]["summary"] = {"status": status, "cause": cause, "suggestion":suggestion} print(json.dumps(postprocess_result, ensure_ascii=False)) diff --git a/sysom_server/sysom_diagnosis/service_scripts/pingtrace_post b/sysom_server/sysom_diagnosis/service_scripts/pingtrace_post index 406898f52afaab6ccbcecd7ad5ef52d5ab9a2f5c..f8b71146ba3421549b0a4d9519e7179fcb342a20 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/pingtrace_post +++ b/sysom_server/sysom_diagnosis/service_scripts/pingtrace_post @@ -24,14 +24,27 @@ class CpingDeamon(object): dPing["seq"].append(obj) ret = {} + status = "normal" + cause = "" + suggestion = "" ret['pingtraceFlow'] = {} ret["pingtraceFlow"]['data'] = [] + cur = 0 for stage in dPing['stat']['stage']: ret["pingtraceFlow"]['data'].append( {'key': stage['delay'], 'title': stage['delay'], 'text': "Max:{} Min:{} Avg:{}".format( stage['max'], stage['min'], stage['avg'])} ) + tmp = (stage['max'] - stage['min']) / stage['max'] + if tmp > 0.5: + status = "error" + if tmp > cur: + cur = tmp + cause = stage['delay'] + "出现网络抖动" + + postprocess_result['result'] = ret + postprocess_result["result"]["summary"] = {"status": status, "cause": cause, "suggestion":suggestion} s = json.dumps(postprocess_result, indent=4) print(s) diff --git a/sysom_server/sysom_diagnosis/service_scripts/procdiag_post.py b/sysom_server/sysom_diagnosis/service_scripts/procdiag_post.py new file mode 100644 index 0000000000000000000000000000000000000000..159a2268d2a7c2c3711db4a88f63bc9f752c71ae --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/procdiag_post.py @@ -0,0 +1,42 @@ +import json +from typing import List +from .base import DiagnosisJobResult, DiagnosisPostProcessor, PostProcessResult + +class PostProcessor(DiagnosisPostProcessor): + def parse_diagnosis_result(self, results: List[DiagnosisJobResult]) -> PostProcessResult: + postprocess_result = PostProcessResult( + code=0, + err_msg="", + result={} + ) + + results = results[0].stdout + #结果字符串数据分割 + results = results.split("\n") + #是否写入数据标识 + write_off = True + index_flag = 0 + md_msg = '# 诊断结果' + for msg in results: + if "diagnosing report" in msg: + index_flag = results.index(msg) + write_off = False + if not write_off: + if "---" in msg: + continue + md_msg = md_msg + "\n\n" + msg + + md_msg = md_msg + "\n\n" + "# 诊断过程" + write_off = True + for msg in results[:index_flag]: + if "---" in msg: + write_off = False + msg = msg.replace("---", "") + msg = "\n## %s\n" % msg + if not write_off: + md_msg = md_msg + "\n\n" + msg + + postprocess_result.result = { + "procdiag_data": {"data": md_msg} + } + return postprocess_result diff --git a/sysom_server/sysom_diagnosis/service_scripts/procdiag_pre.py b/sysom_server/sysom_diagnosis/service_scripts/procdiag_pre.py new file mode 100644 index 0000000000000000000000000000000000000000..20d71a975c4d27b6af10b96a14821e04f4cdb618 --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/procdiag_pre.py @@ -0,0 +1,33 @@ +import re +from .base import DiagnosisJob, DiagnosisPreProcessor, DiagnosisTask, FileItem +class PreProcessor(DiagnosisPreProcessor): + """Get release info diagnosis + Just invoke command in target instance and get stdout result + Args: + DiagnosisPreProcessor (_type_): _description_ + """ + def get_diagnosis_cmds(self, params: dict) -> DiagnosisTask: + # 从前端传递的参数中读取目标实例IP + instance = params.get("instance", "") + time = params.get("time", "") + ipport = params.get("ipport", "") + resp = is_valid_ip_port(ipport) + if not resp: + raise Exception("IP: PORT is empty or format does not meet the requirements") + time = int(time) * 10 + cmd = "sysak rtrace --tcpping --dst {} -c {} --period 0.1 --iqr".format(ipport, time) + return DiagnosisTask( + jobs=[DiagnosisJob(instance=instance, cmd=cmd)], + in_order = False, + ) + + +def is_valid_ip_port(ip_port): + pattern = r'^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})$' + match = re.match(pattern, ip_port) + if match: + ip = match.group(1) + port = int(match.group(2)) + if 0 <= port <= 65535: + return True + return False diff --git a/sysom_server/sysom_diagnosis/service_scripts/retran_post b/sysom_server/sysom_diagnosis/service_scripts/retran_post index bc2f557c2ccff2172d720775bb7a86ebf05dd964..f7d5d968080aa0d887ca8f5416e1c1f73d29b229 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/retran_post +++ b/sysom_server/sysom_diagnosis/service_scripts/retran_post @@ -134,6 +134,14 @@ def retran_result(res): newretran["summary"] = "diagnones result: total capture {} retransmits".format( retran_cnt) postprocess_result["result"] = newretran + status = "normal" + cause = "" + suggestion = "" + if retran_cnt != 0: + status = "error" + cause = newretran["summary"] + suggestion = "" + postprocess_result["result"]["summary"] = {"status": status, "cause": cause, "suggestion":suggestion} print(json.dumps(postprocess_result, ensure_ascii=False)) diff --git a/sysom_server/sysom_diagnosis/service_scripts/rtdelay_post.py b/sysom_server/sysom_diagnosis/service_scripts/rtdelay_post.py index 23c06b6b2a47e33baa83f5a0416a24a654450601..cba82ab2314b77cf53e897dde68fc2f25cfe2c2a 100644 --- a/sysom_server/sysom_diagnosis/service_scripts/rtdelay_post.py +++ b/sysom_server/sysom_diagnosis/service_scripts/rtdelay_post.py @@ -42,7 +42,9 @@ class PostProcessor(DiagnosisPostProcessor): except: pass postprocess_result.result = { - "request_set": {"data": datas} + "request_set": {"data": datas}, + "summary": {"status": "normal", "cause": "Just shows real-time lantency", + "suggestion": "Optimize the application based on the latency distribution."} } return postprocess_result diff --git a/sysom_server/sysom_diagnosis/service_scripts/schedmoni_post b/sysom_server/sysom_diagnosis/service_scripts/schedmoni_post index 136c7ec251c7cebd50ae33d493d1678ad261171b..280278f7b6dd8f84b227b8efe7b6256075071d01 100755 --- a/sysom_server/sysom_diagnosis/service_scripts/schedmoni_post +++ b/sysom_server/sysom_diagnosis/service_scripts/schedmoni_post @@ -20,17 +20,54 @@ def extract_params(): res = tmp.read() return res, task_id +def parse_summary(result): + jitsum = result["jitterEventSummary"] + sumdatas = jitsum["data"] + i = 1 + status = "normal" + _reasons = ["delay in runqueue(may be too may task?)", "Notice: cpu takes too many time in kernel", "Notice: irq disabled too long in kernel"] + reason = "N/A" + for data in sumdatas: + if data["value"] != "normal": + if status == "normal": + status = data["value"] + elif data["value"] == "emergency": + if status == "warning": + status = "error" + tmp = str(i)+". "+_reasons[i-1] + if reason == "N/A": + reason = tmp + else: + reason = reason + tmp + i = i + 1 + #print(status) + #print(reason) + sum_res = {} + summary = {} + summary["status"] = status + summary["cause"] = reason + if reason != "N/A": + summary["suggestion"] = "详情请检查/var/log/sysak/schedmoni/schedmoni.json" + else: + summary["suggestion"] = "N/A" + sum_res["summary"] = summary + #print(sum_res) + return sum_res if __name__ == "__main__": str1, _ = extract_params() str2 = str1.replace('\n', "") str3 = str2.replace('\t', "") data3 = json.loads(str3) + results = {} + summary = parse_summary(data3["datasources"]) + results["summary"] = summary["summary"] + results["datasources"] = data3["datasources"] postprocess_result = { "code": 0, "err_msg": "", "result": {} } - postprocess_result["result"] = data3["datasources"] + postprocess_result["result"] = results data = json.dumps(postprocess_result, indent=4, ensure_ascii=False) print(data) diff --git a/sysom_server/sysom_diagnosis/service_scripts/wrapper/__init__.py b/sysom_server/sysom_diagnosis/service_scripts/wrapper/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_server/sysom_diagnosis/service_scripts/wrapper/base.py b/sysom_server/sysom_diagnosis/service_scripts/wrapper/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f6e62ed39e1a878b12b88a0584564a2ec8d222d9 --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/wrapper/base.py @@ -0,0 +1,8 @@ +from service_scripts.base import DiagnosisTask + + +class DiagnosisPreProcessorPostWrapperBase: + """诊断前处理后包装器,用于在诊断前处理后对前处理的结果进行统一加工,比如给命令加上 wrapper 等 + """ + def process(self, task_id: str, diagnosis_task: DiagnosisTask): + pass diff --git a/sysom_server/sysom_diagnosis/service_scripts/wrapper/dummy.py b/sysom_server/sysom_diagnosis/service_scripts/wrapper/dummy.py new file mode 100644 index 0000000000000000000000000000000000000000..b309645f2e067ae88afcb85221489576c99b2072 --- /dev/null +++ b/sysom_server/sysom_diagnosis/service_scripts/wrapper/dummy.py @@ -0,0 +1,12 @@ +from service_scripts.base import DiagnosisTask +from .base import DiagnosisPreProcessorPostWrapperBase + +class DiagnosisPreProcessorPostWrapper(DiagnosisPreProcessorPostWrapperBase): + """所有命令的执行结果导出到临时文件并cat出来 + + Args: + DiagnosisPreProcessorPostWrapperBase (_type_): _description_ + """ + def process(self, task_id: str, diagnosis_task: DiagnosisTask): + for job in diagnosis_task.jobs: + job.cmd = f"({job.cmd}) > /tmp/{job.instance}.txt && cat /tmp/{job.instance}.txt" \ No newline at end of file diff --git a/sysom_server/sysom_hotfix/apps/hotfix/filters.py b/sysom_server/sysom_hotfix/apps/hotfix/filters.py new file mode 100644 index 0000000000000000000000000000000000000000..077308584d0d9afb461e837261fc6b689480ea14 --- /dev/null +++ b/sysom_server/sysom_hotfix/apps/hotfix/filters.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +''' +@Author: wb-msm241621 +@Date: 2023-12-15 11:40:19 +@LastEditTime: 2023-12-15 11:40:19 +@Description: hotfix released list filter +''' + +from django_filters.rest_framework import FilterSet, DateTimeFilter +from .models import ReleasedHotfixListModule + + +class HotfixReleasedFilter(FilterSet): + released_start_time = DateTimeFilter(field_name='released_time', lookup_expr='gte') + released_end_time = DateTimeFilter(field_name='released_time', lookup_expr='lte') + + class Meta: + model = ReleasedHotfixListModule + fields = ['hotfix_id', 'released_kernel_version', 'serious', 'fix_system'] diff --git a/sysom_server/sysom_hotfix/apps/hotfix/migrations/0005_releasedhotfixlistmodule.py b/sysom_server/sysom_hotfix/apps/hotfix/migrations/0005_releasedhotfixlistmodule.py new file mode 100644 index 0000000000000000000000000000000000000000..98e413d536546dc19326111df58236ce9fcead6a --- /dev/null +++ b/sysom_server/sysom_hotfix/apps/hotfix/migrations/0005_releasedhotfixlistmodule.py @@ -0,0 +1,37 @@ +# Generated by Django 3.2.16 on 2023-11-16 06:14 + +from django.db import migrations, models +import lib.utils + + +class Migration(migrations.Migration): + + dependencies = [ + ('hotfix', '0004_auto_20231020_1552'), + ] + + operations = [ + migrations.CreateModel( + name='ReleasedHotfixListModule', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created_at', models.CharField(default=lib.utils.human_datetime, max_length=20, verbose_name='创建时间')), + ('deleted_at', models.CharField(max_length=20, null=True)), + ('hotfix_id', models.CharField(max_length=50, verbose_name='hotfix id')), + ('released_kernel_version', models.CharField(max_length=60, verbose_name='hotfix发布的内核版本')), + ('serious', models.IntegerField(choices=[(0, '可选安装'), (1, '建议安装'), (2, '需要安装')], default=0, verbose_name='hotfix推荐安装级别')), + ('description', models.TextField(default='', verbose_name='hotfix问题描述')), + ('fix_system', models.IntegerField(choices=[(0, '调度'), (1, '内存'), (2, '网络'), (3, '存储'), (4, '其他')], default=0, verbose_name='涉及子系统')), + ('released_time', models.DateTimeField(verbose_name='发布时间')), + ('download_link', models.TextField(default='', verbose_name='hotfix下载链接')), + ('deprecated', models.IntegerField(choices=[(0, '正常'), (1, '废弃')], default=0, verbose_name='该hotfix是否被废弃')), + ('deprecated_info', models.TextField(default='', verbose_name='废弃原因或信息')), + ('modified_time', models.DateTimeField(auto_now=True, verbose_name='记录修改时间')), + ('modified_user', models.CharField(default='', max_length=20, null=True, verbose_name='用于记录最后一次修改的人')), + ], + options={ + 'db_table': 'sys_released_hotfix', + 'ordering': ['-created_at'], + }, + ), + ] diff --git a/sysom_server/sysom_hotfix/apps/hotfix/migrations/0006_auto_20240103_1448.py b/sysom_server/sysom_hotfix/apps/hotfix/migrations/0006_auto_20240103_1448.py new file mode 100644 index 0000000000000000000000000000000000000000..6285e11b7199fb9fd29fa60e95d6cc89d1066f87 --- /dev/null +++ b/sysom_server/sysom_hotfix/apps/hotfix/migrations/0006_auto_20240103_1448.py @@ -0,0 +1,38 @@ +# Generated by Django 3.2.16 on 2024-01-03 06:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('hotfix', '0005_releasedhotfixlistmodule'), + ] + + operations = [ + migrations.AddField( + model_name='releasedhotfixlistmodule', + name='serious_explain', + field=models.TextField(blank=True, default='', verbose_name='推荐说明'), + ), + migrations.AlterField( + model_name='releasedhotfixlistmodule', + name='deprecated_info', + field=models.TextField(blank=True, default='', verbose_name='废弃原因或信息'), + ), + migrations.AlterField( + model_name='releasedhotfixlistmodule', + name='description', + field=models.TextField(blank=True, default='', verbose_name='hotfix问题描述'), + ), + migrations.AlterField( + model_name='releasedhotfixlistmodule', + name='download_link', + field=models.TextField(blank=True, default='', verbose_name='hotfix下载链接'), + ), + migrations.AlterField( + model_name='releasedhotfixlistmodule', + name='released_kernel_version', + field=models.CharField(max_length=180, verbose_name='hotfix发布的内核版本'), + ), + ] diff --git a/sysom_server/sysom_hotfix/apps/hotfix/migrations/0007_alter_releasedhotfixlistmodule_released_kernel_version.py b/sysom_server/sysom_hotfix/apps/hotfix/migrations/0007_alter_releasedhotfixlistmodule_released_kernel_version.py new file mode 100644 index 0000000000000000000000000000000000000000..7d446fbd4f09d0c284da823225075daebd3f75e4 --- /dev/null +++ b/sysom_server/sysom_hotfix/apps/hotfix/migrations/0007_alter_releasedhotfixlistmodule_released_kernel_version.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.16 on 2024-01-17 06:38 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('hotfix', '0006_auto_20240103_1448'), + ] + + operations = [ + migrations.AlterField( + model_name='releasedhotfixlistmodule', + name='released_kernel_version', + field=models.TextField(verbose_name='hotfix发布的内核版本'), + ), + ] diff --git a/sysom_server/sysom_hotfix/apps/hotfix/models.py b/sysom_server/sysom_hotfix/apps/hotfix/models.py index 6b2ef8483d606f5541e563774154a915cfb88307..9faaef0a7080fb0552c1891ea5d40712045c7afe 100644 --- a/sysom_server/sysom_hotfix/apps/hotfix/models.py +++ b/sysom_server/sysom_hotfix/apps/hotfix/models.py @@ -60,3 +60,38 @@ class KernelVersionModel(BaseModel): class Meta: db_table = "sys_hotfix_kernelversion" ordering = ['-created_at'] + + +class ReleasedHotfixListModule(BaseModel): + serious_choice = [ + (0, u'可选安装'), + (1, u'建议安装'), + (2, u'需要安装') + ] + system_choice = [ + (0, u'调度'), + (1, u'内存'), + (2, u'网络'), + (3, u'存储'), + (4, u'其他') + ] + deprecated_choice = [ + (0, u'正常'), + (1, u'废弃') + ] + hotfix_id = models.CharField(max_length=50, verbose_name="hotfix id") + released_kernel_version = models.TextField(verbose_name="hotfix发布的内核版本") + serious = models.IntegerField(default=0, choices=serious_choice, verbose_name="hotfix推荐安装级别") + serious_explain = models.TextField(default="", verbose_name='推荐说明', blank=True) + description = models.TextField(default="", verbose_name="hotfix问题描述", blank=True) + fix_system = models.IntegerField(default=0, choices=system_choice, verbose_name="涉及子系统") + released_time = models.DateTimeField(auto_now=False, verbose_name="发布时间") + download_link = models.TextField(default="", verbose_name="hotfix下载链接", blank=True) + deprecated = models.IntegerField(default=0, choices=deprecated_choice, verbose_name="该hotfix是否被废弃") + deprecated_info = models.TextField(default="", verbose_name="废弃原因或信息", blank=True) + modified_time = models.DateTimeField(auto_now=True, verbose_name='记录修改时间') + modified_user = models.CharField(default="", max_length=20, null=True, verbose_name="用于记录最后一次修改的人") + + class Meta: + db_table = "sys_released_hotfix" + ordering = ['-created_at'] \ No newline at end of file diff --git a/sysom_server/sysom_hotfix/apps/hotfix/serializer.py b/sysom_server/sysom_hotfix/apps/hotfix/serializer.py index 251775242d3331c83bdcbcd5d7cacbe5847b9b74..b19b57ee357a193717a56a5237a5ad76aedabe6f 100644 --- a/sysom_server/sysom_hotfix/apps/hotfix/serializer.py +++ b/sysom_server/sysom_hotfix/apps/hotfix/serializer.py @@ -1,6 +1,17 @@ +import os +import re +import tempfile +import pandas as pd +import functools +from typing import List +from datetime import datetime +from pandas.core.frame import DataFrame +from pandas._libs.tslibs.timestamps import Timestamp from clogger import logger +from django.utils import timezone +from django.core.files.uploadedfile import InMemoryUploadedFile from rest_framework import serializers -from apps.hotfix.models import HotfixModel, OSTypeModel, KernelVersionModel +from apps.hotfix.models import HotfixModel, OSTypeModel, KernelVersionModel, ReleasedHotfixListModule class HotfixSerializer(serializers.ModelSerializer): @@ -20,3 +31,323 @@ class KernelSerializer(serializers.ModelSerializer): class Meta: model = KernelVersionModel fields = '__all__' + +class ReleasedHotfixSerializer(serializers.ModelSerializer): + + deprecated = serializers.SerializerMethodField() + modified_time = serializers.DateTimeField(format="%Y-%m-%d %H:%M:%S") + released_time = serializers.DateTimeField(format="%Y-%m-%d %H:%M:%S") + + class Meta: + model = ReleasedHotfixListModule + fields = '__all__' + + def get_deprecated(self, obj): return bool(obj.deprecated) + + +class CreateReleasedHotfixSerializer(serializers.ModelSerializer): + + deprecated = serializers.BooleanField(default=False) + deprecated_info = serializers.CharField(default="") + + class Meta: + model = ReleasedHotfixListModule + exclude = ['modified_time', 'modified_user'] + + def validate(self, attrs): + deprecated_info = attrs.get('deprecated_info', "") + deprecated = attrs.get('deprecated', False) + if not deprecated and len(deprecated_info) > 0: + raise serializers.ValidationError('Field deprecated is false, deprecated_info is empty!') + + if deprecated and len(deprecated_info) == 0: + raise serializers.ValidationError("Field deprecated is true, field deprecated_info required!") + + return super().validate(attrs) + + def validate_hotfix_id(self, attrs): + instance = ReleasedHotfixListModule.objects.filter(hotfix_id=attrs).first() + if instance is None: + return super().validate(attrs) + else: + raise serializers.ValidationError(detail='Field hotfix_id does not exist') + + def validate_released_kernel_version(self, attrs): + instance = ReleasedHotfixListModule.objects.filter(released_kernel_version=attrs).first() + if instance is None: + return super().validate(attrs) + else: + raise serializers.ValidationError(detail='Field released_kernel_version does not exist') + + +class UpdateReleasedHotfixSerializer(serializers.ModelSerializer): + released_kernel_version = serializers.CharField(required=False) + hotfix_id = serializers.CharField(required=False) + released_time = serializers.DateTimeField(required=False) + + class Meta: + model = ReleasedHotfixListModule + exclude = ['modified_time', 'modified_user'] + + def validate_hotfix_id(self, attrs): + """Filter the data named "hotfix_id" so that it cannot be repeated in the database""" + instance = ReleasedHotfixListModule.objects.filter(hotfix_id=attrs).first() + if instance: + raise serializers.ValidationError(f"field hotfix_id: {attrs} Exist") + else: + return attrs + + def validate(self, attrs): + """ + Filter the data named "released_kernel_version" + so that it cannot be repeated in the database + """ + hotfix_id=attrs.get("hotfix_id",None) + released_kernel_version=attrs.get("released_kernel_version",None) + if hotfix_id and released_kernel_version: + instance_kernel_version=ReleasedHotfixListModule.objects.filter(**attrs) + if instance_kernel_version: + raise serializers.ValidationError(f"released_kernel_version:{released_kernel_version} is already existed!") + return attrs + + +class UpdatePutReleasedHotfixSerializer(serializers.ModelSerializer): + deprecated = serializers.BooleanField() + + class Meta: + model = ReleasedHotfixListModule + exclude = ['modified_time', 'modified_user'] + + def validate_deprecated(self, attrs): + return 1 if attrs else 0 + + def validate(self, attrs): + deprecated_info = attrs.get('deprecated_info', "") + deprecated = attrs.get('deprecated', False) + if not deprecated and len(deprecated_info) > 0: + raise serializers.ValidationError('Field deprecated is false, deprecated_info is empty!') + + if deprecated and len(deprecated_info) == 0: + raise serializers.ValidationError("Field deprecated is true, field deprecated_info required!") + return super().validate(attrs) + + +class BulkImportHotfixReleasedSerializer(serializers.Serializer): + file = serializers.FileField(required=True) + + def __init__(self, instance=None, data=..., **kwargs): + super().__init__(instance, data, **kwargs) + self._file: InMemoryUploadedFile = None + self._save_temporary_file = None + self._suffix: str = None + self._save_file_path = None + self._default_save_path = '/tmp/' + self._suffixs = ['xls', 'csv', 'xlsx'] + self._file_header_fields = [ + "hotfix_id", "released_time", "released_kernel_version" + ] + self._error_download_link = [] + self._action = { + "csv": functools.partial(pd.read_csv), + "xls": functools.partial(pd.read_excel, keep_default_na=False, engine="xlrd"), + "xlsx": functools.partial(pd.read_excel, keep_default_na=False, engine="openpyxl"), + } + + def validate_file(self, file: InMemoryUploadedFile): + """validate param `file`""" + file_name = file.name + self._suffix = file_name.split('.')[-1] + + if self._suffix not in self._suffixs: + raise serializers.ValidationError('file suffix invaild!') + self._file = file + + self._save_upload_file() + try: + context = self._file_parse() + except Exception as e: + raise serializers.ValidationError(f'parse file faid! Error: {e}') + finally: + self.close_file() + return context + + def _save_upload_file(self): + """Save the file to a temporary file""" + self._save_temporary_file = tempfile.NamedTemporaryFile( + dir=self._default_save_path, + suffix=f".{self._suffix}", + ) + self._save_file_path = os.path.join(self._default_save_path, self._save_temporary_file.name) + + with open(self._save_file_path, 'wb') as f: + for chunk in self._file.chunks(chunk_size=1024): + f.write(chunk) + + def _file_parse(self) -> DataFrame: + """parse the file""" + if self._suffix is None: + raise serializers.ValidationError("file suffix is not none!") + + if self._save_file_path is None: + raise serializers.ValidationError('file save file is not none!') + return self._action[self._suffix](self._save_file_path) + + def _kernel_version_or_download_link_map( + self, + kernel_versions: List[str], + urls: List[str] + ) -> List[dict]: + ''' + @Description: Find the corresponding download link in the + download links by using the key characters in the kernel- + version + @Params `kernel_version`: Type(List) Store the kernel ve- + rsion list + @Params `urls`: Type(List) Store the donwload_link list + @return {*} + ''' + _version_download_link_list = list() + reg_compile = re.compile( + "(\d+)\.(\d+)\.(\d+)-(\d+)\.?(\d+){0,1}(?!_)" + ) + + def _filter(marks: List[str], url: str): + ''' + @Description: + @param {List} marks + @param {str} url + @return {*} + ''' + flag = True + for mark in marks: + if mark not in url: + flag = False + break + return flag + + def _filter_subset(filter_sub : List, kernel_version_flag): + """ + filter out the exact downloadlink + @ filter_flag: ['http:', '', 'yum.tbsite.net', 'alios', '7u2', 'os', 'aarch64', ※'kernel-hotfix-CVE-2023-0461-5.10.134-13' ※, + 'kernel-hotfix-CVE-2023-0461-5.10.134-13-1.0-20230316152330.al8.aarch64.rpm'] + """ + for kernel_link in filter_sub: + filter_flag = kernel_link.split("/")[7] + filter_flag_fin = re.search(reg_compile, filter_flag).group(0) + if kernel_version_flag == filter_flag_fin: + return kernel_link + + for kernel_version in kernel_versions: + v = re.search(reg_compile, kernel_version).groups() + version_s = re.sub(reg_compile, "", kernel_version) + marks = version_s.split(".")[1:] + marks.append(v) + marks = re.findall(r"([\d\-\.]+)\.((\w+)\.)+(\w+)", kernel_version)[0] # tuple + # [5.10.23-5 , al8 , x86_64] + filter_sub = list(filter(lambda x: _filter(marks, x), urls)) + if len(filter_sub) >= 1: + subset_result = filter_sub[0] if len(filter_sub) == 1 else _filter_subset(filter_sub, marks[0]) #!!! + _version_download_link_list.append((kernel_version, subset_result)) + return _version_download_link_list + + def create(self, validated_data): + """Save the data to the database""" + def _released_time(released_time): + """ + @Description: released time handler + @param {*} released_time: `released_time` type is string or DateTime or empty string + @return {*} released_time: type(datetime) + """ + if isinstance(released_time, str) and released_time != "": + released_time = datetime.strptime( + released_time.split(".")[0], "%Y-%m-%d %H:%M:%S" + ) + released_time = timezone.get_current_timezone().localize(released_time) + elif isinstance(released_time, Timestamp): + released_time = timezone.get_current_timezone().localize(released_time) + else: + released_time = timezone.now() + return released_time + + def _structure_released_hotfix(**data_rows: dict) -> List[ReleasedHotfixListModule]: + """ + initizaer released hotfix model + """ + def _release_kernel_version_model(hotfix_id, released_kernel_version, released_time, **kwargs): + try: + ReleasedHotfixListModule.objects.get( + hotfix_id=hotfix_id, released_kernel_version=released_kernel_version) + return None + except ReleasedHotfixListModule.DoesNotExist: + return ReleasedHotfixListModule( + hotfix_id=hotfix_id, released_kernel_version=released_kernel_version, + released_time=released_time, **kwargs + ) + + released_hotfix_list = list() + hotfix_id = data_rows.pop("hotfix_id") + released_time = _released_time(data_rows.pop("released_time")) + released_kernel_version: str = data_rows.pop("released_kernel_version") + + model_fields_list = [ + field.name + for field in ReleasedHotfixListModule._meta.get_fields() + ] + + _other_kwargs = { + k: v + for k, v in data_rows.items() + if k in model_fields_list\ + and ( + v is not None + and + v != "" + ) + } + + kernel_versions = released_kernel_version.split(",") + if len(kernel_versions) == 1: + _model = _release_kernel_version_model( + hotfix_id, released_kernel_version, released_time, **_other_kwargs + ) + released_hotfix_list.append(_model) if _model is not None else ... + else: + download_link: str = _other_kwargs.get("download_link") + download_links = download_link.split(" ") + if download_link is None or len(download_links) == 1: + for kernel_version in kernel_versions: + _model = _release_kernel_version_model( + hotfix_id, kernel_version, released_time, **_other_kwargs + ) + released_hotfix_list.append(_model) if _model is not None else ... + else: + for item in self._kernel_version_or_download_link_map( + kernel_versions, download_links + ): + kernel_version, download_link = item + _other_kwargs.update({"download_link": download_link}) + _model = _release_kernel_version_model( + hotfix_id, kernel_version, released_time, **_other_kwargs + ) + released_hotfix_list.append(_model) if _model is not None else ... + return released_hotfix_list + + excel_file_content: DataFrame = validated_data.get("file") + for param in [ + field for field in self._file_header_fields\ + if field not in excel_file_content.columns.values + ]: + raise serializers.ValidationError( + f"excel tabel header required include {param} field" + ) + + for data in excel_file_content.to_dict(orient='records'): + ReleasedHotfixListModule.objects.bulk_create( + _structure_released_hotfix(**data) + ) + return [] + + def close_file(self): + """delete the temporary file!""" + if self._save_temporary_file: + self._save_temporary_file.close() diff --git a/sysom_server/sysom_hotfix/apps/hotfix/urls.py b/sysom_server/sysom_hotfix/apps/hotfix/urls.py index 0762e342bc12efa6770f1faf3a6eba74ba9ab9ec..875623c80965151e5c583d349cb971cbe0edaf39 100644 --- a/sysom_server/sysom_hotfix/apps/hotfix/urls.py +++ b/sysom_server/sysom_hotfix/apps/hotfix/urls.py @@ -1,12 +1,13 @@ -from django.urls import path +from django.urls import path, include, re_path from django.urls.conf import include from rest_framework.routers import DefaultRouter from apps.hotfix import views -router = DefaultRouter() +router = DefaultRouter() router.register('hotfix', views.HotfixAPIView) +# router.register('releasehotfix', views.ReleaseHotfixListAPIView, basename='releasehotfix') urlpatterns = [ path('api/v1/hotfix/create_hotfix/', views.HotfixAPIView.as_view({'post': 'create_hotfix'})), @@ -34,5 +35,10 @@ urlpatterns = [ path('api/v1/hotfix/rebuild_hotfix/',views.HotfixAPIView.as_view({'post': 'rebuild_hotfix'})), path('api/v1/hotfix/oneclick_deploy/', views.HotfixAPIView.as_view({'post': 'oneclick_deploy'})), path('api/v1/hotfix/health_check/', views.HealthViewset.as_view({'get': 'health_check'})), + path('api/v1/hotfix/get_released_hotfixs/', views.ReleaseHotfixListAPIView.as_view({'get': 'get_filter_released_hotfixs'})), + path('api/v1/hotfix/insert_released_hotfix_info/', views.ReleaseHotfixListAPIView.as_view({'post': 'add_one_released_hotfix'})), + path('api/v1/hotfix/import_from_tablefiles/', views.ReleaseHotfixListAPIView.as_view({'post': 'import_from_table_v2'})), + path('api/v1/hotfix/update_put_released_hotfix_info//', views.ReleaseHotfixListAPIView.as_view({'put': 'update_released_hotfix_record'})), + path('api/v1/hotfix/update_patch_released_hotfix_info//', views.ReleaseHotfixListAPIView.as_view({'patch': 'update_released_hotfix_record'})), path('api/v1/', include(router.urls)), ] diff --git a/sysom_server/sysom_hotfix/apps/hotfix/views.py b/sysom_server/sysom_hotfix/apps/hotfix/views.py index 2f7b325fdd9ab3fe773eca48df6f5b6119d482be..268d81cec4a1579dde75809cd55886ac7b69322a 100644 --- a/sysom_server/sysom_hotfix/apps/hotfix/views.py +++ b/sysom_server/sysom_hotfix/apps/hotfix/views.py @@ -16,11 +16,14 @@ from django_filters.rest_framework import DjangoFilterBackend from rest_framework.exceptions import ValidationError from django.conf import settings from rest_framework.viewsets import GenericViewSet -import re +import sys +import pandas as pd from apps.hotfix import serializer -from apps.hotfix.models import HotfixModel, OSTypeModel, KernelVersionModel +from apps.hotfix.models import HotfixModel, OSTypeModel, KernelVersionModel, ReleasedHotfixListModule +from apps.hotfix.filters import HotfixReleasedFilter from lib.response import * +from lib.paginations import Pagination from lib.utils import human_datetime, datetime, datetime_str from lib.exception import APIException from lib.base_view import CommonModelViewSet @@ -91,7 +94,6 @@ class SaveUploadFile(APIView): raise APIException(message=f"Upload Failed: {e}") return success(result={"patch_name":patch_file_name}, message="Upload success") - class HotfixAPIView(GenericViewSet, mixins.ListModelMixin, mixins.RetrieveModelMixin, @@ -446,7 +448,7 @@ class HotfixAPIView(GenericViewSet, debuginfo_link = request.data['debuginfo_link'] os_type = request.data['os_type'] image = request.data['image'] - use_source = request.data['use_source'] + use_src_package = request.data['use_src_package'] if len(kernel_version)>0 and len(source)>0 and len(devel_link)>0 and len(debuginfo_link)>0 and len(os_type)>0 and len(image) > 0: try: @@ -459,7 +461,7 @@ class HotfixAPIView(GenericViewSet, devel_link = devel_link, debuginfo_link = debuginfo_link, image = image, - use_source = use_source + use_src_package = use_src_package ) else: return other_response(message="same kernel version found in record...") @@ -558,14 +560,6 @@ class HotfixAPIView(GenericViewSet, os_type_object.source_debuginfo = request.data['source_debuginfo'] # src_pkg_mark = request.data.get("src_pkg_mark", None) git_rule = request.data.get("git_rule", None) - logger.info(os_type_object.git_rule) - logger.info(git_rule) - if git_rule != os_type_object.git_rule: - patch_file_repo = os.path.join(settings.HOTFIX_FILE_BRANCH_RULE) - file_path = os.path.join(patch_file_repo, git_rule) - logger.info(file_path) - os.remove(file_path) - os_type_object.git_rule = request.data['git_rule'] os_type_object.save() thread_runner = threading.Thread(target=self.function.sync_kernel, name="sync_kernel",args=(os_type_object.id,)) thread_runner.start() @@ -608,4 +602,67 @@ class HotfixAPIView(GenericViewSet, class HealthViewset(CommonModelViewSet): def health_check(self, request, *args, **kwargs): - return success(result={}) \ No newline at end of file + return success(result={}) + + +class ReleaseHotfixListAPIView(GenericViewSet, + mixins.ListModelMixin, + mixins.RetrieveModelMixin, + mixins.CreateModelMixin, + mixins.UpdateModelMixin, + mixins.DestroyModelMixin): + queryset = ReleasedHotfixListModule.objects.all() + pagination_class = Pagination + serializer_class = serializer.ReleasedHotfixSerializer + filter_class = HotfixReleasedFilter + filter_backends = [DjangoFilterBackend] + http_method_names = ['get', 'post', 'patch', 'put', 'delete'] + + def get_serializer_class(self): + """appoint method serializer""" + request_method_dict = { + "POST": serializer.CreateReleasedHotfixSerializer, + "PUT": serializer.UpdatePutReleasedHotfixSerializer, + "PATCH": serializer.UpdateReleasedHotfixSerializer, + "GET": serializer.ReleasedHotfixSerializer, + } + return request_method_dict[self.request.method] + + """this function returns the filtered records in the database + """ + def get_filter_released_hotfixs(self, request: Request, *args, **kwargs): + queryset = self.filter_queryset(self.get_queryset()) + if not queryset: + return success([], total=0) + return super().list(request, *args, **kwargs) + + """insert one released hotfix record into database + """ + def add_one_released_hotfix(self, request, *args, **kwarg): + create_serializer = self.get_serializer(data=request.data) + create_serializer.is_valid(raise_exception=True) + self.perform_create(create_serializer) + + ser = serializer.ReleasedHotfixSerializer(create_serializer.instance) + return success(result=ser.data) + + """this function is used to update one hotfix record + However, the information inside just can update, but cannot delete! + You can update it into blank message " ", but I hope this message should never be deleted! + """ + def update_released_hotfix_record(self, request, *args, **kwargs): + """appoint patch method""" + partial = kwargs.pop('partial', False) + instance = self.get_object() + update_serializer = self.get_serializer(instance, data=request.data, partial=partial) + update_serializer.is_valid(raise_exception=True) + self.perform_update(update_serializer) + + ser = serializer.ReleasedHotfixSerializer(update_serializer.instance, many=False) + return success(result=ser.data) + + def import_from_table_v2(self, request): + ser = serializer.BulkImportHotfixReleasedSerializer(data=request.data) + ser.is_valid(raise_exception=True) + self.perform_create(ser) + return success(result={},message="save files successful!") diff --git a/sysom_server/sysom_hotfix/lib/function.py b/sysom_server/sysom_hotfix/lib/function.py index 63c3a3687349cfcc69f87db4903de34abfd7ba5d..4ab2867937f99252653a270443f5adf3b4f48cd9 100644 --- a/sysom_server/sysom_hotfix/lib/function.py +++ b/sysom_server/sysom_hotfix/lib/function.py @@ -11,7 +11,7 @@ import requests import json import time -from apps.hotfix.models import HotfixModel, OSTypeModel, KernelVersionModel +from apps.hotfix.models import HotfixModel, OSTypeModel, KernelVersionModel, ReleasedHotfixListModule from cec_base.producer import dispatch_producer, Producer from cec_base.consumer import Consumer, dispatch_consumer from cec_base.admin import dispatch_admin @@ -426,6 +426,27 @@ class FunctionClass(): os_type_object.save() except Exception as e: logger.error(e) + + def query_released_hotfix_by_para(self, search_hotfix_id,search_kernel_version, + search_serious, search_released_time, search_fix_system): + try: + objects = ReleasedHotfixListModule.objects.all() + if search_hotfix_id is not None: + objects = objects.filter(hotfix_id=search_hotfix_id) + if search_kernel_version is not None: + objects = objects.filter(released_kernel_version=search_kernel_version) + if search_serious is not None: + objects = objects.filter(serious=search_serious) + if search_released_time is not None: + objects = objects.filter(released_time = search_released_time) + if search_fix_system is not None: + objects = objects.filter(fix_system=search_fix_system) + return objects + except Exception as e: + logger.error("Error when filtering released hotfix database") + logger.error("query_released_hotfix_by_para: %s " % str(e)) + return None + """ CECListener listen topic of hotfix_msg, which is send by builder @@ -533,4 +554,15 @@ class CECListener(): except Exception as e: logger.error(str(e)) - \ No newline at end of file + +""" +Hotfix Server Exception +""" +class HotfixServerException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) + + @staticmethod + def msg(self, msg: str) -> str: + return msg \ No newline at end of file diff --git a/sysom_server/sysom_monitor_server/config.yml b/sysom_server/sysom_monitor_server/config.yml index b96a9c04d545240d27d4ef227183a79ff8716990..0e941feda96e08753606e7263e997f29310c0d1c 100644 --- a/sysom_server/sysom_monitor_server/config.yml +++ b/sysom_server/sysom_monitor_server/config.yml @@ -1,8 +1,8 @@ vars: NODE_EXPORT_BASE_DOWNLOAD_URL: &NODE_EXPORT_BASE_DOWNLOAD_URL https://sysom.oss-cn-beijing.aliyuncs.com/monitor/ NODE_EXPORT_VERSION: &NODE_EXPORT_VERSION 1.5.0 - SYSAK_DOWNLOAD_URL: &SYSAK_DOWNLOAD_URL https://mirrors.openanolis.cn/sysak/packages/release-v2.2.0/ - SYSAK_VERSION: &SYSAK_VERSION 2.2.0-1 + SYSAK_DOWNLOAD_URL: &SYSAK_DOWNLOAD_URL https://mirrors.openanolis.cn/sysak/packages/release-v2.4.0/ + SYSAK_VERSION: &SYSAK_VERSION 2.4.0-1 SERVICE_NAME: &SERVICE_NAME sysom_monitor_server SERVICE_CONSUMER_GROUP: !concat &SERVICE_CONSUMER_GROUP [*SERVICE_NAME, "_consumer_group"] diff --git a/sysom_server/sysom_monitor_server/scripts/node_init.sh b/sysom_server/sysom_monitor_server/scripts/node_init.sh index 8f292ebdaac7cd6a197a6221d2114f84e7caae9e..efbff7a97c37e789205685d8db8487bf55c19c4d 100755 --- a/sysom_server/sysom_monitor_server/scripts/node_init.sh +++ b/sysom_server/sysom_monitor_server/scripts/node_init.sh @@ -2,7 +2,7 @@ RESOURCE_DIR=${NODE_HOME}/${SERVICE_NAME} if [ "$SYSAK_VERTION" == "" ]; then - export SYSAK_VERTION=2.2.0-1 + export SYSAK_VERTION=2.4.0-1 fi if [ "$NODE_EXPORT_VERSION" == "" ]; then export NODE_EXPORT_VERSION=1.5.0 diff --git a/sysom_server/sysom_vul/apps/vul/async_fetch.py b/sysom_server/sysom_vul/apps/vul/async_fetch.py index 56df8b4b27d9f46cd753b9ce526b6f6b51b470f6..1eb8a46f50fe1072e5835ea7f8fd035265da1ac1 100644 --- a/sysom_server/sysom_vul/apps/vul/async_fetch.py +++ b/sysom_server/sysom_vul/apps/vul/async_fetch.py @@ -110,6 +110,7 @@ class FetchVulData: 'params': json.loads(instance.params), 'auth': auth } + 返回参数对象 """ kwargs = dict() @@ -132,6 +133,14 @@ class FetchVulData: @classmethod def _get_page_total_num(cls, kwargs) -> Union[bool, int]: + """向漏洞库请求数据 + + Args: + kwargs (_type_): 创建的结构化参数 + + Returns: + Union[bool, int]: 如果请求成功,返回请求数据的从页;失败返回False + """ response = requests.request(**kwargs) if response.status_code == 200: result = response.json() diff --git a/sysom_server/sysom_vul/apps/vul/vul.py b/sysom_server/sysom_vul/apps/vul/vul.py index aaf7289a7a933ad25d1556d37eb6cfd6961aef6a..c470c8ef954ce8e21ff236d261d6e28a4ee4785d 100644 --- a/sysom_server/sysom_vul/apps/vul/vul.py +++ b/sysom_server/sysom_vul/apps/vul/vul.py @@ -39,21 +39,23 @@ def update_vul_db(): 更新漏洞数据库数据 """ logger.info("Begin to get vul db address") - vul_addrs = VulAddrModel.objects.all() + vul_addrs = VulAddrModel.objects.all() # 获取漏洞数据库中所有的漏洞库信息 for vul_addr in vul_addrs: logger.info("Try to get vul db info") - vul_addr_obj = VulDataParse(vul_addr) + vul_addr_obj = VulDataParse(vul_addr) # 生成漏洞库操作实例 try: - for res in vul_addr_obj._get_vul_data(): + for res in vul_addr_obj._get_vul_data(): # 获取每一项cve数据,解析返回数据,根据cve是否存在进行更新或者插入 vul_addr_obj.parse_and_store_vul_data(res) except Exception as e: logger.warning(e) logger.warning(f"failed in {vul_addr.url}") - + """ + VulDataParse:CVE漏洞库操作实例 + """ class VulDataParse(object): def __init__(self, vul_addr_obj: VulAddrModel): - self.vul_addr_obj = vul_addr_obj + self.vul_addr_obj = vul_addr_obj self.cve_data_path = list(filter(None, self._parser_cve_item_path)) @property diff --git a/sysom_web/config/routes.js b/sysom_web/config/routes.js index 1fc6070cbecc0c920343da21a353cf3249a4aec3..3232d8ccd3538f0352fa2b5da7c6533f998dd040 100644 --- a/sysom_web/config/routes.js +++ b/sysom_web/config/routes.js @@ -157,29 +157,34 @@ export default [ name: 'app_observable', access: 'canAdmin', routes: [ - { - path: '/app_observable', - redirect: "/app_observable/net_topo", - }, - { - path: '/app_observable/net_topo', - name: 'net_topo', - component: "./app_observable/net_topo" - }, - { - path: '/app_observable/mysql', - name: 'mysql', - component: "./app_observable/mysql" - }, - { - path: "/app_observable/java", - redirect: '/app_observable/process', - }, - { - path: "/app_observable/process", - name: "process", - component: "./app_observable/process" - } + // { + // path: '/app_observable', + // redirect: "/app_observable/net_topo", + // }, + // { + // path: '/app_observable/net_topo', + // name: 'net_topo', + // component: "./app_observable/net_topo" + // }, + // { + // path: '/app_observable/mysql', + // name: 'mysql', + // component: "./app_observable/mysql" + // }, + // { + // path: "/app_observable/java", + // redirect: '/app_observable/process', + // }, + // { + // path: "/app_observable/process", + // name: "process", + // component: "./app_observable/process" + // }, + // { + // path: "/app_observable/nginx", + // name: "nginx", + // component: "./app_observable/nginx" + // } ], }, { @@ -221,69 +226,18 @@ export default [ routes: [ { path: '/diagnose', - redirect: '/diagnose/oscheck', - }, - { - path: '/diagnose/oscheck', - name: 'oscheck', - component: './diagnose/oscheck', - }, - { - path: '/diagnose/cpu', - name: 'cpu', - routes: [ - { - path: '/diagnose/cpu', - redirect: '/diagnose/cpu/loadtask', - } - ] - }, - { - path: '/diagnose/storage', - name: 'storage', - routes: [ - { - path: '/diagnose/storage', - redirect: '/diagnose/storage/iolatency', - } - ] - }, - { - path: '/diagnose/net', - name: 'net', - routes: [ - { - path: '/diagnose/net', - redirect: '/diagnose/net/pingtrace', - }, - ] - }, - { - path: '/diagnose/memory', - name: 'memory', - routes: [ - { - path: '/diagnose/memory', - redirect: '/diagnose/memory/memgraph', - } - ] + redirect: '/diagnose/ossre', }, { path: '/diagnose/detail/:task_id?', layout: false, component: "./diagnose/detail" }, - // { - // path: '/diagnose/custom', - // name: 'custom', - // routes: [ - // { - // path: '/diagnose/custom/pannel', - // name: 'pannel', - // component: './diagnose/generate/Pannel/index', - // } - // ] - // } + { + path: '/diagnose/query', + name: 'query', + component: './diagnose/query' + }, ], }, { @@ -362,6 +316,11 @@ export default [ component: './hotfix/Version/VersionCustomize', }, ] + }, + { + path: '/hotfix/released', + name: 'released_hotfix', + component: './hotfix/Released' } ] }, diff --git a/sysom_web/cypress.config.js b/sysom_web/cypress.config.js index d2a8834d6a638638d768631b1b35fd91ac3e4e44..4499d57c9075d74d0938f9e957854f5a54818323 100644 --- a/sysom_web/cypress.config.js +++ b/sysom_web/cypress.config.js @@ -5,9 +5,12 @@ module.exports = defineConfig({ baseUrl: "http://localhost:8000", // baseUrl: "http://sysom_dev.qjm253.cn", experimentalStudio: true, + chromeWebSecurity: false, env: { SYSOM_ACCOUNT_USERNAME: "admin", - SYSOM_ACCOUNT_PASSWORD: "123456" + SYSOM_ACCOUNT_PASSWORD: "123456", + HOSTS: ['127.0.0.1', '127.0.0.1'], + DEFAULT_HOST_PASSWORD: "123456" }, setupNodeEvents(on, config) { // implement node event listeners here diff --git a/sysom_web/cypress/e2e/account/login.cy.js b/sysom_web/cypress/e2e/account/login.cy.js index 2e676dafcfef0634961f414b48edfcfd2451b9cf..e1380737f672678ce2e364ad086e1352885d9e0e 100644 --- a/sysom_web/cypress/e2e/account/login.cy.js +++ b/sysom_web/cypress/e2e/account/login.cy.js @@ -1,13 +1,5 @@ /// describe("SysOM Login Page Test", () => { - it("login failed", () => { - cy.visit("/user/login") - cy.get("#username").focus().type("admin") - cy.get("#password").focus().type("123456") - cy.get("button").contains("登录").click() - - cy.get("button").contains("忽 略").click() - // cy.get("#username").focus() - }) + it("login failed", () => { cy.login() }) }) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/alarm/alarm_list.cy.js b/sysom_web/cypress/e2e/alarm/alarm_list.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..3f5f023befb16b53d6f72c017a143d980fdbbe29 --- /dev/null +++ b/sysom_web/cypress/e2e/alarm/alarm_list.cy.js @@ -0,0 +1,86 @@ +/// + + +describe("SysOM Alarm Manager Test", () => { + beforeEach(() => { + cy.login() + }) + it.only("alarm list", () => { + cy.intercept("GET", "/api/v1/alarm/list?*").as("getAlarmList") + + cy.visit("alarm/list") + + cy.wait("@getAlarmList").then((interception) => { + expect(interception).to.have.property('response') + expect(interception.response?.body.code, 'code').to.equal(200) + expect(interception.response.statusCode).to.equal(200) + + // cy.get('.ant-table-tbody').find('tr').should("have.length.gte", 0) + cy.wait(1000) + cy.get('.ant-table-content').find('table').then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + // 断言告警列表数据内容大于等于1 + cy.wrap($el).find('.ant-table-tbody').find('tr').should('have.length.gte', 1) + + // 断言告警表格有11个字段 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').should('have.length.gte', 9) + + const td = cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td') + // 断言告警ID格式是否为UUID V4 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').eq(2).find('span').invoke('text').should('match', /^[0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-f]{12}$/i) + + // 断言告警级别是否在枚举类型中 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').eq(5).invoke("text").then((text) => { + expect(text.trim()).to.be.oneOf(['监控告警', '应用告警', '其他告警']) + }) + + // 断言告警级别是否在枚举类型中 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').eq(6).invoke("text").then((text) => { + expect(text.trim()).to.be.oneOf(['严重', '警告', '错误']) + }) + + // 断言告警处理状态是否在枚举类型中 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').eq(8).invoke("text").then((text) => { + expect(text.trim()).to.be.oneOf(['已读', '未读']) + }) + + // 断言告警时间格式是否正确 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').eq(9).invoke("text").should('match', /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/) + } + }) + }) + + cy.wait(1000) + cy.get('.ant-pro-form-collapse-button').contains("展开").click() + + cy.get('#alert_level').click() + cy.get(".rc-virtual-list-holder-inner").contains("警告").click() + + cy.get('#alert_category').click() + cy.get(".rc-virtual-list-holder-inner").contains("应用告警").click() + + cy.get('#deal_status').click() + cy.get(".rc-virtual-list-holder-inner").contains("未读").click() + + // 点击查询 + cy.get(':nth-child(2) > .ant-btn').click() + + cy.wait(1000) + + cy.get('.ant-table-tbody').then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("tr").should("have.length.gte", 1) + cy.wrap($el).find("tr").eq(0).contains("标记已读").click() + } + }) + + // 点击重置 + cy.get('.ant-space > :nth-child(1) > .ant-btn').click() + // 点击收起 + cy.get('.ant-pro-form-collapse-button').contains("收起").click() + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/app_observable/java_process.cy.js b/sysom_web/cypress/e2e/app_observable/java_process.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..629f88b0b4ff8a9170c50f5f0ed6686773493a5e --- /dev/null +++ b/sysom_web/cypress/e2e/app_observable/java_process.cy.js @@ -0,0 +1,142 @@ +/// + +describe("SysOM Migration Monitor Dashboard Test", () => { + beforeEach(() => { + cy.login() + }) + + it("Migration monitor test", () => { + // 1. 访问java可观测 + cy.visit("/app_observable/process_app"); + + // 2. 等待页面加载完成 + cy.wait(1000); + + // 运行时间(Stat面板数值类型) + cy.getPannelContentByTitle("运行时间") + .then(($el) => { + if ($el.text().includes("No data")) { + // 面板没有数据的情况 + cy.wrap($el).contains("No data"); + } + else { + const num = parseInt($el.text()); + expect(num).to.be.greaterThan(0); + } + }); + + // 虚拟内存(Stat面板数值类型) + cy.getPannelContentByTitle("虚拟内存") + .then(($el) => { + if ($el.text().includes("No data")) { + // 面板没有数据的情况 + cy.wrap($el).contains("No data"); + } + else { + const num = parseInt($el.text()); + expect(num).to.be.greaterThan(0); + } + }); + + // 最大时延(Stat面板数值类型) + cy.getPannelContentByTitle("最大延时") + .then(($el) => { + if ($el.text().includes("No data")) { + // 面板没有数据的情况 + cy.wrap($el).contains("No data"); + } + else { + const num = parseInt($el.text()); + expect(num).to.be.greaterThan(0); + } + }); + + // cpu(Pie Chart) + cy.getPannelContentByTitle("cpu").find("ul li").should("have.length.gte", 1); // Legend 至少有一列 + + // 内存(Pie Chart) + cy.getPannelContentByTitle("内存").find("ul li").should("have.length.gte", 1); // Legend 至少有一列 + + // cpu占用率(Pie Chart) + cy.getPannelContentByTitle("cpu占比").find("ul li").should("have.length.gte", 1); // Legend 至少有一列 + + // IO 吞吐量(Time series 面板) + cy.getPannelContentByTitle("IO 吞吐量").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("IO 吞吐量").find("tbody tr").eq(0).find("td").eq(0).contains("read_bytes"); // 第一列的第一行是 read_bytes + cy.getPannelContentByTitle("IO 吞吐量").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + cy.getPannelContentByTitle("IO 吞吐量").find("tbody tr").eq(1).find("td").eq(0).contains("write_bytes"); // 第一列的第二行是 write_bytes + cy.getPannelContentByTitle("IO 吞吐量").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + + + // cpu占用率(Time series 面板) + cy.getPannelContentByTitle("cpu占用率折线图").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("cpu占用率折线图").find("tbody tr").eq(0).find("td").eq(0).contains("内核态cpu_sys"); // 第一列的第一行是 内核态cpu_sys + cy.getPannelContentByTitle("cpu占用率折线图").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + cy.getPannelContentByTitle("cpu占用率折线图").find("tbody tr").eq(1).find("td").eq(0).contains("用户态cpu_user"); // 第一列的第二行是 用户态cpu_user + cy.getPannelContentByTitle("cpu占用率折线图").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + + + // 进程主次缺页次数(Time series 面板) + cy.getPannelContentByTitle("进程主次缺页次数").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("进程主次缺页次数").find("tbody tr").eq(0).find("td").eq(0).contains("主缺页次数majflt"); // 第一列的第一行是 主缺页次数majflt + cy.getPannelContentByTitle("进程主次缺页次数").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + cy.getPannelContentByTitle("进程主次缺页次数").find("tbody tr").eq(1).find("td").eq(0).contains("次缺页次数minflt"); // 第一列的第二行是 次缺页次数minflt + cy.getPannelContentByTitle("进程主次缺页次数").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + + + // IO延迟(Time series 面板) + cy.getPannelContentByTitle("IO延迟").find("tbody tr").should("have.length", 1); // Legend 有两列 + cy.getPannelContentByTitle("IO延迟").find("tbody tr").eq(0).find("td").eq(0).contains("IO延迟"); // 第一列的第一行是 IO延迟 + cy.getPannelContentByTitle("IO延迟").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + + + // 进程切换(Time series 面板) + cy.getPannelContentByTitle("进程切换").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("进程切换").find("tbody tr").eq(0).find("td").eq(0).contains("进程自主切换"); // 第一列的第一行是 进程自主切换 + cy.getPannelContentByTitle("进程切换").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + cy.getPannelContentByTitle("进程切换").find("tbody tr").eq(1).find("td").eq(0).contains("进程非自主切换"); // 第一列的第二行是 进程非自主切换 + cy.getPannelContentByTitle("进程切换").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + + + // 进程调度状态(Time series 面板) + cy.getPannelContentByTitle("进程调度状态").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("进程调度状态").find("tbody tr").eq(0).find("td").eq(0).contains("运行时间time"); // 第一列的第一行是 运行时间time + cy.getPannelContentByTitle("进程调度状态").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + cy.getPannelContentByTitle("进程调度状态").find("tbody tr").eq(1).find("td").eq(0).contains("调度延迟delay"); // 第一列的第二行是 调度延迟delay + cy.getPannelContentByTitle("进程调度状态").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.least(0); + }); + + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/diagnosis/colocation_cpi.cy.js b/sysom_web/cypress/e2e/diagnosis/colocation_cpi.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..9e48a65fcc1fe947f59f809edeffed6129295a17 --- /dev/null +++ b/sysom_web/cypress/e2e/diagnosis/colocation_cpi.cy.js @@ -0,0 +1,29 @@ +/// + +describe("SysOM Cluster Manager Test", () => { + beforeEach(() => { + // 自动登录 + cy.login() + }) + it("Invoke colocation cpi diagnosis, and check result", () => { + cy.sysomDiagnosisCheck( + // 诊断前端url + "/diagnose/colocation/cpi", + + // 诊断参数 + { + "instance": "127.0.0.1", + "moment": "2024-03-07 10:59:58", + }, + + // 诊断结果处理(在此处判断诊断的结果数据是否符合预期) + (result) => { + cy.get('.ant-pro-card').eq(3).contains('诊断结果汇总') + cy.get('.ant-pro-card').eq(3).find(".ant-pro-card-body").contains('结论') + cy.get('.ant-pro-card').eq(3).find(".ant-pro-card-body").contains('修复建议') + cy.get('.ant-pro-card').eq(4).contains('容器信息') + cy.get('.ant-pro-card').eq(6).contains('时序信息') + // cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Event overview"); + }) + }) +}) diff --git a/sysom_web/cypress/e2e/diagnosis/colocation_servutil.cy.js b/sysom_web/cypress/e2e/diagnosis/colocation_servutil.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..3fef58ec2e503d1db466398062ef596b73756f90 --- /dev/null +++ b/sysom_web/cypress/e2e/diagnosis/colocation_servutil.cy.js @@ -0,0 +1,28 @@ +/// + +describe("SysOM Cluster Manager Test", () => { + beforeEach(() => { + // 自动登录 + cy.login() + }) + it("Invoke colocation cpi diagnosis, and check result", () => { + cy.sysomDiagnosisCheck( + // 诊断前端url + "/diagnose/colocation/serveutil", + + // 诊断参数 + { + "instance": "127.0.0.1", + "moment": "2024-03-05 14:15:49", + }, + + // 诊断结果处理(在此处判断诊断的结果数据是否符合预期) + (result) => { + cy.get('.ant-pro-card').eq(3).contains('诊断结果汇总') + cy.get('.ant-pro-card').eq(3).find(".ant-pro-card-body").contains('结论') + cy.get('.ant-pro-card').eq(3).find(".ant-pro-card-body").contains('修复建议') + cy.get('.ant-pro-card').eq(4).contains('受干扰容器详情') + // cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Event overview"); + }) + }) +}) diff --git a/sysom_web/cypress/e2e/diagnosis/custom_command.cy.js b/sysom_web/cypress/e2e/diagnosis/custom_command.cy.js deleted file mode 100644 index 3a6d6946d68485431d58abac3cde87c36b401b86..0000000000000000000000000000000000000000 --- a/sysom_web/cypress/e2e/diagnosis/custom_command.cy.js +++ /dev/null @@ -1,81 +0,0 @@ -/// - -describe("SysOM Cluster Manager Test", () => { - beforeEach(() => { - // 自动登录 - cy.login() - }) - it.only("Invoke command diagnosis, and check result", () => { - cy.sysomDiagnosisCheck( - // 诊断前端url - "/diagnose/custom/command", - - // 诊断参数 - { - "instance": "127.0.0.1", - "command": "ls -ltrh" - }, - - // 诊断结果处理(在此处判断诊断的结果数据是否符合预期) - // { - // "id": 83, - // "created_at": "2023-09-11 17:34:59", - // "updated_at": "2023-09-11 17:34:59", - // "task_id": "d4KvXgub", - // "status": "Success", - // "service_name": "command", - // "code": 0, - // "err_msg": "", - // "result": { - // "CommandResult": { - // "data": [ - // { - // "key": "", - // "value": "total 600M\n-rw-r--r-- 1 root root 45M Jun 26 22:17 sysom.tar.gz\ndrwxr-xr-x 14 root root 4.0K Aug" - // } - // ] - // } - // }, - // "params": { - // "service_name": "command", - // "instance": "127.0.0.1", - // "command": "ls -ltrh" - // }, - // "created_by": 1, - // "url": "/diagnose/detail/d4KvXgub" - // } - (result) => { - // result => 包含诊断API返回的诊断详情数据 - - //////////////////////////////////////////////////////////// - // 在此处补充诊断详情渲染后的前端页面是否符合预期 - // 断言文档:https://docs.cypress.io/guides/references/assertions#Text-Content - //////////////////////////////////////////////////////////// - /* ==== Generated with Cypress Studio ==== */ - cy.get('.ant-statistic-content-value').should("contain.text", "total") - cy.get('.ant-statistic-content-value').should(($element) => { - // total 600M - // -rw-r--r-- 1 root root 45M Jun 26 22:17 sysom.tar.gz - // drwxr-xr-x 14 root root 4.0K Aug 2 11:36 sysom-2.2 - // -rw-r--r-- 1 root root 238M Aug 2 11:38 sysom-2.2.tar.gz - // drwxr-xr-x 14 root root 4.0K Aug 2 14:53 sysom-3.0 - // drwxr-xr-x 8 root root 4.0K Aug 2 14:55 rpmbuild - // -rw-r--r-- 1 root root 238M Aug 2 14:56 sysom-3.0.tar.gz - // drwxr-xr-x 14 root root 4.0K Sep 11 20:31 sysom - let lines = $element.text().split("\n") - - // 断言结果文本至少大于 1 行 - expect(lines.length).to.be.gt(1) - - // total 600M - let first_line = lines[0] - first_line = first_line.replace("total ", "") - - // 断言总文件大小至少大于0 - expect(parseInt(first_line)).to.be.gt(0) - }) - cy.get('.ant-pro-card-border > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Command Result") - /* ==== End Cypress Studio ==== */ - }) - }) -}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/diagnosis/diagnose_query.cy.js b/sysom_web/cypress/e2e/diagnosis/diagnose_query.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..a1b5428c658d62b3ac9a01a8cf8d9014dd6b7dc6 --- /dev/null +++ b/sysom_web/cypress/e2e/diagnosis/diagnose_query.cy.js @@ -0,0 +1,103 @@ +/* + * @Author: wb-msm241621 + * @Date: 2024-03-11 17:40:55 + * @LastEditTime: 2024-03-14 17:24:51 + * @Description: + */ +/// + +Cypress.on('uncaught:exception', (err, runnable) => { + // return false to prevent the error from failing the test if it matches + // the specific error message + if (err.message.includes('ResizeObserver loop completed with undelivered notifications')) { + return false; + } + // else let Cypress handle the exception as it normally does + return true; +}); + +describe("SysOM Diagnosis Test -- Query", () => { + beforeEach(() => { + // 自动登录 + cy.login() + }) + it("Invoke diagnose query, and check result", () => { + cy.intercept("GET", "/api/v1/tasks/?*").as("getDiagnoseTaskList") + + // 1. 访问诊断查询页面 + cy.visit("diagnose/query") + + cy.wait("@getDiagnoseTaskList", { timeout: 10000 }).then((interception) => { + expect(interception).to.have.property('response') + expect(interception.response?.body.code, 'code').to.equal(200) + expect(interception.response.statusCode).to.equal(200) + + // cy.get('.ant-table-tbody').find('tr').should("have.length.gte", 0) + cy.wait(1000) + cy.get('.ant-table-content').find('table').then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + // 断言告警列表数据内容大于等于1 + cy.wrap($el).find('.ant-table-tbody').find('tr').should('have.length.gte', 1) + // 断言表头字段数量是否等于5 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').should('have.length.gte', 5) + + // 断言诊断ID是否为UUID + cy.wrap($el).find(".ant-table-tbody").find('tr').eq(0).find('td').eq(1).invoke("text").then((text) => { + expect(text).to.match(/[a-zA-Z0-9]{8}/); + }) + + // 断言诊断时间格式 + cy.wrap($el).find(".ant-table-tbody").find('tr').eq(0).find('td').eq(2).invoke("text").then((text) => { + const data = new Date(text); + expect(data.toString()).not.to.equal('Invalid Date!') + }) + + // 断言诊断状态是否在枚举类型中 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').eq(3).invoke("text").then((text) => { + expect(text.trim()).to.be.oneOf(["诊断完毕", "异常", "准备中", "运行中"]) + }) + + // 断言诊断参数是否大于1 + cy.wrap($el).find('.ant-table-tbody').find('tr').eq(0).find('td').eq(4).find("span").should('have.length.gt', 1) + } + }) + }) + + cy.wait(2000) + + // 2. 诊断查询参数悬着ssh通道 + cy.get('#channel').click({force: true}) + cy.get(".rc-virtual-list-holder-inner").contains("SSH通道").click() + + cy.get(':nth-child(2) > .ant-btn').click() + cy.wait(2000) + + // 3. + cy.get(".ant-table-content").find("table").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find(".ant-table-tbody").find('tr').eq(0).find('td').eq(5).find('.ant-space-item').eq(0).then(($el) => { + if ($el.text().includes("查看出错信息")) { + // 诊断失败后查看错误信息 + cy.wrap($el).get("a").contains("查看出错信息").click() + cy.wait(1000) + } else { + // 诊断成功后查看诊断结果 + cy.wrap($el).get("a").contains("查看诊断结果").click() + cy.wait(1000) + + cy.get(".ant-pro-card-header").eq(0).scrollIntoView() + cy.get(".ant-pro-card-header").should("contain.text", "诊断结果") + } + }) + } + }) + cy.wait(1500) + cy.scrollTo("top") + + cy.get('.ant-pro-card-body > .ant-form > .ant-space-align-center > :nth-child(1) > .ant-btn').click() + }) +}) diff --git a/sysom_web/cypress/e2e/diagnosis/filecache.cy.js b/sysom_web/cypress/e2e/diagnosis/filecache.cy.js index 8076c18af93591a8634a2198dbe185aa255f6a1b..659b6f5b1e7b985b3e06bba2a13bbb4e71ab67a0 100644 --- a/sysom_web/cypress/e2e/diagnosis/filecache.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/filecache.cy.js @@ -12,7 +12,7 @@ describe("SysOM Diagnosis Test -- filecache", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], //"value": "", "type": "all" }, @@ -85,33 +85,30 @@ describe("SysOM Diagnosis Test -- filecache", () => { // cy.get('.ant-statistic-content-value').click() //cy.get('.ant-table-thead').should("contain.text", "请求时间") /* ==== End Cypress Studio ==== */ + cy.wait(1000) + cy.get('.ant-pro-card-title > div').scrollIntoView() + cy.get('.ant-pro-card-title > div').should('contain.text', '诊断结果') + cy.get('.ant-table-content').last().find('table').then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + const dataList = result.result?.podmem?.data + if (dataList && dataList.length > 0) { + const rowItem = dataList[0] + const rowNames = Object.keys(rowItem) + const thNames = rowNames.filter(item => item != "key") + + cy.get('table').last().find('thead').find('tr').find('th').should('have.length.gte', 6) + thNames.forEach((name, index) => { + cy.get('table').last().find('thead').find('tr').find('th').eq(index).should('contain.text', name) + }) + } + } + }) }) /* ==== Generated with Cypress Studio ==== */ - - cy.get('[style="padding: 0px;"] > .ant-pro-table > \ - .ant-pro-card > .ant-pro-card-body > .ant-table-wrapper > \ - .ant-spin-nested-loading > .ant-spin-container > .ant-table > \ - .ant-table-container > .ant-table-content > table > .ant-table-thead > \ - tr > :nth-child(1)').should("contain.text", "POD"); - - cy.get('[style="padding: 0px;"] > .ant-pro-table > .ant-pro-card > \ - .ant-pro-card-body > .ant-table-wrapper > .ant-spin-nested-loading > \ - .ant-spin-container > .ant-table > .ant-table-container > \ - .ant-table-content > table > .ant-table-thead > tr > :nth-child(2)') - .should("contain.text", "Container"); - cy.get('[style="padding: 0px;"] > .ant-pro-table > .ant-pro-card > \ - .ant-pro-card-body > .ant-table-wrapper > .ant-spin-nested-loading > \ - .ant-spin-container > .ant-table > .ant-table-container > \ - .ant-table-content > table > .ant-table-thead > tr > :nth-child(3)') - .should("contain.text", "Filename"); - - cy.get('[style="padding: 0px;"] > .ant-pro-table > .ant-pro-card > \ - .ant-pro-card-body > .ant-table-wrapper > .ant-spin-nested-loading > \ - .ant-spin-container > .ant-table > .ant-table-container > \ - .ant-table-content > table > .ant-table-thead > tr > :nth-child(5)') - .should("contain.text", "Cached"); /* ==== End Cypress Studio ==== */ }) @@ -122,7 +119,7 @@ describe("SysOM Diagnosis Test -- filecache", () => { // 诊断参数 { - "instance": "192.168.0.136", + "instance": Cypress.env("HOSTS")[0], //"value": "", "type": "host" }, @@ -187,19 +184,25 @@ describe("SysOM Diagnosis Test -- filecache", () => { // cy.get('.ant-statistic-content-value').click() //cy.get('.ant-table-thead').should("contain.text", "请求时间") /* ==== End Cypress Studio ==== */ - + cy.get('.ant-pro-card-title > div').scrollIntoView() + cy.get('.ant-pro-card-title > div').should('contain.text', '诊断结果') + cy.get('.ant-table-content').last().find('table').then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + const dataList = result.result?.podmem?.data + if (dataList && dataList.length > 0) { + const rowItem = dataList[0] + const rowNames = Object.keys(rowItem) + const thNames = rowNames.filter(item => item != "key") + + cy.get('table').last().find('thead').find('tr').find('th').should('have.length.gte', 6) + thNames.forEach((name, index) => { + cy.get('table').last().find('thead').find('tr').find('th').eq(index).should('contain.text', name) + }) + } + } + }) }) - /* ==== Generated with Cypress Studio ==== */ - cy.get('[style="padding: 0px;"] > .ant-pro-table > .ant-pro-card > \ - .ant-pro-card-body > .ant-table-wrapper > .ant-spin-nested-loading > \ - .ant-spin-container > .ant-table > .ant-table-container > \ - .ant-table-content > table > .ant-table-thead > tr > :nth-child(1)') - .should("contain.text", "Filename"); - cy.get('[style="padding: 0px;"] > .ant-pro-table > .ant-pro-card > \ - .ant-pro-card-body > .ant-table-wrapper > .ant-spin-nested-loading > \ - .ant-spin-container > .ant-table > .ant-table-container > \ - .ant-table-content > table > .ant-table-thead > tr > :nth-child(3)') - .should("contain.text", "Cached"); - /* ==== End Cypress Studio ==== */ }) }) diff --git a/sysom_web/cypress/e2e/diagnosis/iofsstat_test.cy.js b/sysom_web/cypress/e2e/diagnosis/iofsstat_test.cy.js index c0462370553d64d74eae8fe9d22d9a93ced13ef2..cc3fbee5aabdfb784b7d6c3e852a804b629691ca 100644 --- a/sysom_web/cypress/e2e/diagnosis/iofsstat_test.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/iofsstat_test.cy.js @@ -13,7 +13,7 @@ describe("SysOM Diagnosis Test -- iofsstat", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], "timeout": "5", }, (result) => { diff --git a/sysom_web/cypress/e2e/diagnosis/iohang_test.cy.js b/sysom_web/cypress/e2e/diagnosis/iohang_test.cy.js index 28f955005afeaacfd7728c47291544c48a82a36c..f004950a47e059ef32770038ea6606a132e6986a 100644 --- a/sysom_web/cypress/e2e/diagnosis/iohang_test.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/iohang_test.cy.js @@ -13,7 +13,7 @@ describe("SysOM Diagnosis Test -- iohang", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], "threshold": "1", "timeout": "10", }, @@ -26,9 +26,11 @@ describe("SysOM Diagnosis Test -- iohang", () => { //////////////////////////////////////////////////////////// /* ==== Generated with Cypress Studio ==== */ // cy.get('.ant-statistic-content-value').click() - cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "IO HANG overview"); - cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /normal|abnormal/); - cy.get(':nth-child(2) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "More details of TOP 10 IO"); + cy.diagnosisTaskResultHandler(result, () => { + cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "IO HANG overview"); + cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /normal|abnormal/); + cy.get(':nth-child(2) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "More details of TOP 10 IO"); + }) //cy.get('.ant-statistic-content-value').should("contain.text", "total") /* ==== End Cypress Studio ==== */ }) diff --git a/sysom_web/cypress/e2e/diagnosis/iolatency_test.cy.js b/sysom_web/cypress/e2e/diagnosis/iolatency_test.cy.js index 0f8499bfe2783bed8ae5fde5ef1e8d65e9548db3..f398d8c6a74e393684841ac42fc3a0820fe4f4d9 100644 --- a/sysom_web/cypress/e2e/diagnosis/iolatency_test.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/iolatency_test.cy.js @@ -13,7 +13,7 @@ describe("SysOM Diagnosis Test -- iolatency", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], "timeout": "5", "threshold": "1" }, @@ -27,11 +27,16 @@ describe("SysOM Diagnosis Test -- iolatency", () => { //////////////////////////////////////////////////////////// /* ==== Generated with Cypress Studio ==== */ // cy.get('.ant-statistic-content-value').click() - cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Iolatency overview"); - cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /normal|abnormal/); - cy.get(':nth-child(2) > .ant-pro-card-header').should("contain.text", "Overall delay distribution"); - cy.get(':nth-child(3) > .ant-pro-card-header').should("contain.text", "Single IO delay metrics display"); - cy.get(':nth-child(4) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "More details of TOP 10 IO"); + if (result.status == "Fail") { + cy.get('.ant-modal-confirm-title').should("contain.text", "诊断失败") + cy.get('.ant-modal-confirm-btns > .ant-btn > span').click() + } else { + cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Iolatency overview"); + cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /normal|abnormal/); + cy.get(':nth-child(2) > .ant-pro-card-header').should("contain.text", "Overall delay distribution"); + cy.get(':nth-child(3) > .ant-pro-card-header').should("contain.text", "Single IO delay metrics display"); + cy.get(':nth-child(4) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "More details of TOP 10 IO"); + } //cy.get('.ant-statistic-content-value').should("contain.text", "total") /* ==== End Cypress Studio ==== */ }) diff --git a/sysom_web/cypress/e2e/diagnosis/jitter.cy.js b/sysom_web/cypress/e2e/diagnosis/jitter.cy.js index 39fbe2fc9bd56a48d05e51a73e05caad4e7e7d7c..e18cd06cf17e1464a7acd6580916823ca797b948 100644 --- a/sysom_web/cypress/e2e/diagnosis/jitter.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/jitter.cy.js @@ -12,7 +12,7 @@ describe("SysOM Diagnosis Test -- jitter", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], }, // "result": { @@ -54,9 +54,11 @@ describe("SysOM Diagnosis Test -- jitter", () => { (result) => { // cy.get(':nth-child(1) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "发送端报文路径"); /* ==== Generated with Cypress Studio ==== */ - cy.get(':nth-child(1) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "发送端报文路径"); - cy.get(':nth-child(2) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "接收端报文路径"); - /* ==== End Cypress Studio ==== */ + if (result.status === "Success") { + cy.get(':nth-child(1) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "发送端报文路径"); + cy.get(':nth-child(2) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "接收端报文路径"); + /* ==== End Cypress Studio ==== */ + } }) }) }) diff --git a/sysom_web/cypress/e2e/diagnosis/load_diagnosis_test.cy.js b/sysom_web/cypress/e2e/diagnosis/load_diagnosis_test.cy.js index c59110f3d6dee1c7580f1e2eedcd103555fce46f..b850b70620c7834eb1b03ed99810e6850f960c2e 100644 --- a/sysom_web/cypress/e2e/diagnosis/load_diagnosis_test.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/load_diagnosis_test.cy.js @@ -13,7 +13,7 @@ describe("SysOM Diagnosis Test -- loadtask", () => { // 诊断参数 { - "instance": "127.0.0.1" + "instance": Cypress.env("HOSTS")[0] }, // 诊断结果处理(在此处判断诊断的结果数据是否符合预期) diff --git a/sysom_web/cypress/e2e/diagnosis/memgraph.cy.js b/sysom_web/cypress/e2e/diagnosis/memgraph.cy.js index 054f6919ef1d03fa894768dac243b44f7a1fcabb..ae849e167c4285892081178b28c360a98077cc7e 100644 --- a/sysom_web/cypress/e2e/diagnosis/memgraph.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/memgraph.cy.js @@ -12,7 +12,7 @@ describe("SysOM Diagnosis Test -- memgraph", () => { // 诊断参数 { - "instance": "127.0.0.1" + "instance": Cypress.env("HOSTS")[0] }, // 诊断结果处理(在此处判断诊断的结果数据是否符合预期) @@ -259,7 +259,11 @@ describe("SysOM Diagnosis Test -- memgraph", () => { // 断言文档:https://docs.cypress.io/guides/references/assertions#Text-Content //////////////////////////////////////////////////////////// /* ==== Generated with Cypress Studio ==== */ - cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should('match',/^[0-9]*%$/); + // cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should('match',/^[0-9]*%$/); + cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.gt(0); + }) cy.get(':nth-child(2) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should('match',/NG|OK/); cy.get(':nth-child(3) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should('match',/NG|OK/); cy.get(':nth-child(4) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should('match',/NG|OK/); diff --git a/sysom_web/cypress/e2e/diagnosis/oomcheck.cy.js b/sysom_web/cypress/e2e/diagnosis/oomcheck.cy.js index 116d0fd13a4dbc383d8036c1e32eb096532a9534..c38a62a946d95281466ebeead7feba86c67ff46b 100644 --- a/sysom_web/cypress/e2e/diagnosis/oomcheck.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/oomcheck.cy.js @@ -13,7 +13,7 @@ describe("SysOM Diagnosis Test -- oomcheck", () => { // 诊断参数 { - "instance": "127.0.0.1" + "instance": Cypress.env("HOSTS")[0] }, // 诊断结果处理(在此处判断诊断的结果数据是否符合预期) diff --git a/sysom_web/cypress/e2e/diagnosis/packetdrop.cy.js b/sysom_web/cypress/e2e/diagnosis/packetdrop.cy.js index 4bfd4545a790b3855fa302a8e375f4a5730373b6..710bac0b9669d544d0a08e298ec3b63eef503f82 100644 --- a/sysom_web/cypress/e2e/diagnosis/packetdrop.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/packetdrop.cy.js @@ -12,7 +12,7 @@ describe("SysOM Diagnosis Test -- packetdrop", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], }, // result { @@ -49,8 +49,10 @@ describe("SysOM Diagnosis Test -- packetdrop", () => { // 断言文档:https://docs.cypress.io/guides/references/assertions#Text-Content //////////////////////////////////////////////////////////// /* ==== Generated with Cypress Studio ==== */ - cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-title').should("contain.text", "tcp"); - cy.get(':nth-child(3) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "丢包详情列表"); + cy.diagnosisTaskResultHandler(result, () => { + cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-title').should("contain.text", "tcp"); + cy.get(':nth-child(3) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "丢包详情列表"); + }) /* ==== End Cypress Studio ==== */ }) }) diff --git a/sysom_web/cypress/e2e/diagnosis/retran.cy.js b/sysom_web/cypress/e2e/diagnosis/retran.cy.js index 6f5d6cbc0194e4e5922f4f0b7a98f2ff9b05b0ed..c5a3ef36b981c11805703bf4dfee785087c644c0 100644 --- a/sysom_web/cypress/e2e/diagnosis/retran.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/retran.cy.js @@ -12,7 +12,7 @@ describe("SysOM Diagnosis Test -- retran", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], }, @@ -124,12 +124,14 @@ describe("SysOM Diagnosis Test -- retran", () => { // cy.get('.ant-statistic-content-value').click() // cy.get(':nth-child(1) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "发送端报文路径"); /* ==== Generated with Cypress Studio ==== */ - cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "retran overview"); - cy.get(':nth-child(2) > [style="margin-right: -8px; margin-left: -8px;"] > :nth-child(1) > .ant-pro-card-border > [style="padding: 0px;"] > .ant-pro-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "sourcePortDistribution"); - cy.get(':nth-child(2) > [style="margin-right: -8px; margin-left: -8px;"] > :nth-child(2) > .ant-pro-card-border > [style="padding: 0px;"] > .ant-pro-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "destPortDistribution"); - cy.get(':nth-child(3) > [style="margin-right: -8px; margin-left: -8px;"] > :nth-child(1) > .ant-pro-card-border > [style="padding: 0px;"] > .ant-pro-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "sourceIpDistribution"); - cy.get(':nth-child(3) > [style="margin-right: -8px; margin-left: -8px;"] > :nth-child(2) > .ant-pro-card-border > [style="padding: 0px;"] > .ant-pro-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "destIpDistribution"); - cy.get(':nth-child(4) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "重传详情列表"); + cy.diagnosisTaskResultHandler(result, () => { + cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "retran overview"); + cy.get(':nth-child(2) > [style="margin-right: -8px; margin-left: -8px;"] > :nth-child(1) > .ant-pro-card-border > [style="padding: 0px;"] > .ant-pro-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "sourcePortDistribution"); + cy.get(':nth-child(2) > [style="margin-right: -8px; margin-left: -8px;"] > :nth-child(2) > .ant-pro-card-border > [style="padding: 0px;"] > .ant-pro-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "destPortDistribution"); + cy.get(':nth-child(3) > [style="margin-right: -8px; margin-left: -8px;"] > :nth-child(1) > .ant-pro-card-border > [style="padding: 0px;"] > .ant-pro-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "sourceIpDistribution"); + cy.get(':nth-child(3) > [style="margin-right: -8px; margin-left: -8px;"] > :nth-child(2) > .ant-pro-card-border > [style="padding: 0px;"] > .ant-pro-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "destIpDistribution"); + cy.get(':nth-child(4) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "重传详情列表"); + }) /* ==== End Cypress Studio ==== */ }) }) diff --git a/sysom_web/cypress/e2e/diagnosis/rtdelay.cy.js b/sysom_web/cypress/e2e/diagnosis/rtdelay.cy.js index af3f98d400aaae08293d60978f46bd3e21e56275..b7a1b5ea2af76d93fd14c845b28f1bb525395c6d 100644 --- a/sysom_web/cypress/e2e/diagnosis/rtdelay.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/rtdelay.cy.js @@ -12,7 +12,7 @@ describe("SysOM Diagnosis Test -- rtdelay", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], "pid": "-1", "time": "2" }, @@ -61,7 +61,10 @@ describe("SysOM Diagnosis Test -- rtdelay", () => { //////////////////////////////////////////////////////////// /* ==== Generated with Cypress Studio ==== */ // cy.get('.ant-statistic-content-value').click() - cy.get('.ant-table-thead').should("contain.text", "请求时间") + + cy.diagnosisTaskResultHandler(result, ()=> { + cy.get('.ant-table-thead').should("contain.text", "请求时间") + }) /* ==== End Cypress Studio ==== */ }) }) diff --git a/sysom_web/cypress/e2e/diagnosis/schedmoni.cy.js b/sysom_web/cypress/e2e/diagnosis/schedmoni.cy.js index 9eb29356899688963c0f93d051e43b8b6602f262..a93c53eb7ea785f6b1a7b6ee2f290a90c2d4f01a 100644 --- a/sysom_web/cypress/e2e/diagnosis/schedmoni.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/schedmoni.cy.js @@ -12,19 +12,30 @@ describe("SysOM Cluster Manager Test", () => { // 诊断参数 { - "instance": "127.0.0.1", + "instance": Cypress.env("HOSTS")[0], "timeout": "1", - "threshold": "1" + "threshold": "1" }, // 诊断结果处理(在此处判断诊断的结果数据是否符合预期) (result) => { - cy.get('.ant-pro-card-border.ant-pro-card-contain-card > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Event overview"); - cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /emergency|warning|normal/); - cy.get(':nth-child(2) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /emergency|warning|normal/); - cy.get(':nth-child(3) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /emergency|warning|normal/); - cy.get(':nth-child(2) > .ant-pro-card-header').should("contain.text", "Timeline Diagram"); - cy.get(':nth-child(3) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Scheduling Jitter Details"); + cy.get('.ant-pro-card-title > div').scrollIntoView() + cy.get('.ant-pro-card-title > div').should('contain.text', '诊断结果') + cy.diagnosisTaskResultHandler(result, () => { + cy.get(':nth-child(1) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Event overview"); + cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body').then(($el) => { + if ($el.text().includes("no data")){ + cy.wrap($el).contains("no data") + } else { + cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /emergency|warning|normal/); + cy.get(':nth-child(2) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /emergency|warning|normal/); + cy.get(':nth-child(3) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value').invoke('text').should("match", /emergency|warning|normal/); + } + }) + + cy.get(':nth-child(2) > .ant-pro-card-header').should("contain.text", "Timeline Diagram"); + cy.get(':nth-child(3) > .ant-pro-card-header > .ant-pro-card-title').should("contain.text", "Scheduling Jitter Details"); + }) }) }) }) diff --git a/sysom_web/cypress/e2e/diagnosis/system_health_test.cy.js b/sysom_web/cypress/e2e/diagnosis/system_health_test.cy.js index 3f361d95fa6a2884623026051b3e65fb7ab1e90f..94191c0258669393859d035e326dfc974a5acfc6 100644 --- a/sysom_web/cypress/e2e/diagnosis/system_health_test.cy.js +++ b/sysom_web/cypress/e2e/diagnosis/system_health_test.cy.js @@ -9,7 +9,7 @@ describe("SysOM Diagnosis Test -- system health", () => { //cy.wait(5000) cy.sysomDiagnosisCheck( // 诊断前端url - "diagnose/oscheck", + "diagnose/ossre", // 诊断参数 { @@ -39,4 +39,4 @@ describe("SysOM Diagnosis Test -- system health", () => { /* ==== End Cypress Studio ==== */ }) }) -}) \ No newline at end of file +}) diff --git a/sysom_web/cypress/e2e/host/host.cy.js b/sysom_web/cypress/e2e/host/host.cy.js index f9b01b29c5f33d5b3be9bac221bf89c81890d3fd..73b50e2b87665ec7005d0b492c638227d68b6c82 100644 --- a/sysom_web/cypress/e2e/host/host.cy.js +++ b/sysom_web/cypress/e2e/host/host.cy.js @@ -5,6 +5,8 @@ describe("SysOM Host Manager Test", () => { cy.login() }) it("Crate and delete host", () => { + cy.intercept("GET", "/api/v1/host/") + .as("getHostList") cy.intercept("POST", "/api/v1/host/") .as("createHost") @@ -14,39 +16,45 @@ describe("SysOM Host Manager Test", () => { // 1. 访问主机列表也米娜 cy.visit("/host/list") - // 2. 点击新建主机打开模块框 - cy.get("button").contains("新建主机").click() - - // 3. 在模态框内部填充字段 - cy.get(".ant-modal-content").first().within(() => { - // 3.1 cluster - cy.get("#cluster").focus().type("default").type("{enter}") - - // 3.2 hostname - cy.get("#hostname").focus().clear().type("local") - - // 3.3 username - cy.get("#username").focus().clear().type("root") - - // 3.4 password - cy.get("#host_password").focus().clear().type("alios#123") - - // 3.5 ip - cy.get("#ip").focus().clear().type("127.0.0.1") - - // 3.6 port - cy.get("#port").focus().clear().type("22") - - // 3.7 确认 - cy.get("button").contains("确 认").click() + cy.wait('@getHostList', { timeout: 10000 }) + .then((interception) => { + expect(interception).to.have.property('response') + expect(interception.response?.body.code, 'code').to.equal(200) + expect(interception.response.statusCode).to.equal(200) + + const { data } = interception.response.body + const ipList = data.map((item) => { return item.ip }) + const defaultHostIpOne = Cypress.env("HOSTS")[0] + + // 判断主机列表是否已存在默认测试主机 + if (ipList.includes(defaultHostIpOne)) { + + // 找到默认的主机,并点击删除按钮 + cy.get("td") + .contains(defaultHostIpOne) + .parent() + .within(() => { + cy.get("td").contains("删除").click() + }) + + // 点击删除按钮之后需要在弹出的浮窗中点击OK确认 + cy.get(".ant-popover-buttons").find("button").contains("OK").click() + // 确认删除接口调用结果为 200 + cy.wait('@deleteHost') + .then((interception) => { + cy.wrap({ + statusCode: interception.response?.statusCode + }).its("statusCode").should("eq", 200) + }) + } + }) - // 3.8 等待新建主机请求结束,判断请求是否成功 - // 检查状态码返回是否是200(如果集群已经存在会返回400) - cy.wait('@createHost').its("response.statusCode").should("eq", 200) - }) + // 2. 点击新建主机并开始添加 + cy.addDefaultHost() // 创建完主机后等待一秒钟,一秒钟后执行删除操作 cy.wait(1000) + // 找到最新创建的主机,并点击删除按钮 cy.get("td") @@ -55,10 +63,9 @@ describe("SysOM Host Manager Test", () => { .within(() => { cy.get("td").contains("删除").click() }) - + // 点击删除按钮之后需要在弹出的浮窗中点击OK确认 cy.get(".ant-popover-buttons").find("button").contains("OK").click() - // 确认删除接口调用结果为 200 cy.wait('@deleteHost') .then((interception) => { @@ -66,5 +73,6 @@ describe("SysOM Host Manager Test", () => { statusCode: interception.response?.statusCode }).its("statusCode").should("eq", 200) }) + }) }) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/hotfix/configurekernel.cy.js b/sysom_web/cypress/e2e/hotfix/configurekernel.cy.js index 1f45d699d347c612eed3104c9bf31ceeeac2c4cf..664e3064bba7af1c410c0e422dc9467fbb411c0f 100644 --- a/sysom_web/cypress/e2e/hotfix/configurekernel.cy.js +++ b/sysom_web/cypress/e2e/hotfix/configurekernel.cy.js @@ -1,4 +1,8 @@ /* ==== Test Created with Cypress Studio ==== */ +/** + * This js is use for automaticly test the function + * of kernel version and os type configuration test. + */ it('ostest', function() { cy.login() /** test for OSType configure */ @@ -23,12 +27,13 @@ it('ostest', function() { cy.get(':nth-child(1) > .ant-pro-table-list-toolbar-setting-item > :nth-child(1) > .anticon > svg').click(); /** change information of an OSType */ cy.get('table').within(() => { - cy.get('tr').eq(1).contains('修改').click({force: true}) - }) - cy.get(".ant-modal-content").first().within(() => { - cy.get("#os_type_name").focus().clear().type("alinux3_x86").type("{enter}") - cy.get("button").contains("确 认").click() + cy.get('tr').eq(1).contains('编辑').click({force: true}) }) + cy.get('tr:first-child td:first-child input').clear().type('x86'); + cy.get('table').within(() => { + cy.get('tr').eq(1).contains('保存').click({force: true}) + }) + //cy.get('.ant-typography').contains("保存").click() /* test for Kernel Version Page */ cy.visit('/hotfix/version/customize'); cy.get('#os_type').click(); @@ -41,20 +46,21 @@ it('ostest', function() { cy.get('#devel_link').type('c'); cy.get('#debuginfo_link').clear('d'); cy.get('#debuginfo_link').type('d'); - cy.get(':nth-child(6) > .ant-btn').click(); - /* fix the record of a kernel version */ + cy.get('#image').type('image') + cy.get(':nth-child(8) > .ant-btn').click(); + /* fix the record of a kernel version */ cy.wait(1000) cy.get('table').within(() => { - cy.get('tr').eq(1).contains('修改').click({force: true}) - }) - cy.get(".ant-modal-content").first().within(() => { - cy.get("#kernel_version").focus().clear().type("cccc").type("{enter}") - cy.get("button").contains("确 认").click() + cy.get('tr').eq(1).contains('编辑').click({force: true}) }) - /** delete this record */ + cy.get('tr:first-child td:first-child input').clear().type('ccccc'); + cy.get('table').within(() => { + cy.get('tr').eq(1).contains('保存').click({force: true}) + }) + /** delete this record */ cy.get('table').within(() => { cy.get('tr').eq(1).contains('删除').click({force: true}) }) cy.get('.ant-popover-buttons').contains('OK').click(); }); - \ No newline at end of file + diff --git a/sysom_web/cypress/e2e/hotfix/formal.cy.js b/sysom_web/cypress/e2e/hotfix/formal.cy.js index c72ffaeb28592b39764c02faf601bdb4f8b1da16..2c0c02f982acadc77a551d054a669dd8465aa3b6 100644 --- a/sysom_web/cypress/e2e/hotfix/formal.cy.js +++ b/sysom_web/cypress/e2e/hotfix/formal.cy.js @@ -1,7 +1,9 @@ /* ==== Test Created with Cypress Studio ==== */ -/**Before this test case begins, there should be one successfully built. -And this hotfix should be changed into formal, so that there is a record -for formal hotfix test */ + +/** + * This is a one singal formal hotfix list test. + * This test dont need any successfully build. + */ it('formal hotfix test', function() { cy.login() @@ -12,8 +14,9 @@ it('formal hotfix test', function() { cy.get(':nth-child(1) > .ant-btn > span').click({multiple: true}); cy.get('#hotfix_name').clear('te'); cy.get('#hotfix_name').type('test'); + cy.get('#patch_file').type('patch_file_0001.patch'); + cy.get('#rc_select_1').type('5.10.112-11.1.al8.x86_64'); cy.get(':nth-child(2) > .ant-btn > span').click(); - cy.get('.ant-btn > .anticon > svg').click(); - cy.get('.ant-space > :nth-child(2) > span > a').click(); + /* ==== End Cypress Studio ==== */ }); diff --git a/sysom_web/cypress/e2e/hotfix/formallist.cy.js b/sysom_web/cypress/e2e/hotfix/formallist.cy.js deleted file mode 100644 index c2c027c3036bda93c7910c78ea0cdc5de7f4f349..0000000000000000000000000000000000000000 --- a/sysom_web/cypress/e2e/hotfix/formallist.cy.js +++ /dev/null @@ -1,19 +0,0 @@ -/* ==== Test Created with Cypress Studio ==== */ -/**Before this test case begins, there should be one successfully built. -And this hotfix should be changed into formal, so that there is a record -for formal hotfix test */ - -it('formal hotfix test', function() { - cy.login() - /* ==== Generated with Cypress Studio ==== */ - cy.visit('/hotfix/formal_hotfix'); - cy.get('#created_at').click(); - cy.get('.ant-picker-today-btn').click(); - cy.get(':nth-child(1) > .ant-btn > span').click({multiple: true}); - cy.get('#hotfix_name').clear('te'); - cy.get('#hotfix_name').type('test'); - cy.get(':nth-child(2) > .ant-btn > span').click(); - cy.get('.ant-btn > .anticon > svg').click(); - cy.get('.ant-space > :nth-child(2) > span > a').click(); - /* ==== End Cypress Studio ==== */ - }); \ No newline at end of file diff --git a/sysom_web/cypress/e2e/hotfix/formalproduction.cy.js b/sysom_web/cypress/e2e/hotfix/formalproduction.cy.js index 1f782c85688c4cc23064b0e2e494353c46d31d93..fa0c09cec46e3a568a65ab3c6b8c4e92ba981a6e 100644 --- a/sysom_web/cypress/e2e/hotfix/formalproduction.cy.js +++ b/sysom_web/cypress/e2e/hotfix/formalproduction.cy.js @@ -1,118 +1,135 @@ /// -describe("SysOM Host Manager Test", () => { - - it("login success", () => { - cy.login() - - cy.intercept("POST", "api/v1/hotfix/upload_patch/") - .as("upload_patch") - - - cy.intercept("POST", "api/v1/hotfix/create_hotfix/") - .as("create") - - cy.intercept("GET", "api/v1/hotfix/get_hotfix_list/?*") - .as("gethotfixtasks") - - cy.intercept("POST", "api/v1/hotfix/set_formal/") - .as("set_formal") - - cy.intercept("DELETE", "api/v1/hotfix/delete_hotfix/") - .as("delete_hotfix") - - const getAndCheckTaskResult = (task_id) => { - // 1.点击一下刷新按钮 - cy.get('span[aria-label="reload"]').click() - cy.wait('@gethotfixtasks').its("response.statusCode").should("eq", 200) - cy.wait(100) - cy.get(".ant-table-tbody > tr:nth-child(1)").then($el => { - let current_text = $el.text() - if (current_text.indexOf("等待构建") != -1 || current_text.indexOf("正在构建") != -1) { - //2.诊断运行中,等待一分钟后再次检查 - cy.wait(1000*60) - getAndCheckTaskResult(task_id) - } else { - if (current_text.indexOf("构建成功") != -1) { - - cy.get(".ant-table-tbody > tr:nth-child(1) > td:nth-child(8)").click() - - //3.转正式包的确认 - cy.get(".ant-popover-content").first().within(() => { - cy.get("div.ant-popover-buttons > button.ant-btn.ant-btn-primary.ant-btn-sm").first().click() - }) - - //4.转正式包接口结果判断 - cy.wait('@set_formal') - .then((interception) => { - cy.wrap({ - statusCode: interception.response?.statusCode - }).its("statusCode").should("eq", 200) - }) - - - //5.点击下载按钮 - cy.get(".ant-table-tbody > tr:nth-child(1) > td:nth-child(9) > div").click() - - - //6.点击第一条数据删除 - cy.get('table').within(() => { - cy.get('tr').eq(1).contains("删除").click(); - }) - - //7.点击确认删除 - cy.get(".ant-popover-buttons > button.ant-btn.ant-btn-primary.ant-btn-sm").last().click() - - - //8.删除接口结果判断 - cy.wait('@delete_hotfix') - .then((interception) => { - cy.wrap({ - statusCode: interception.response?.statusCode - }).its("statusCode").should("eq", 200) - }) - } - - } - }) - } - - // 1. 访问主机列表列表 - cy.visit("/hotfix/make") - - //2. 点击安全中心打开模块框 - cy.get('#kernel_version').type('5.10.112-11.1.al8.x86_64'); - cy.get('#hotfix_name').type('test'); - - //3.上传文件 - cy.get("#patch").selectFile('cypress/e2e/hotfix/5.10-new-globals.patch',{force: true}) - - - //4.文件结果判断 - cy.wait('@upload_patch') - .then((interception) => { - cy.wrap({ - statusCode: interception.response?.statusCode - }).its("statusCode").should("eq", 200) - }) - - //5.点击创建 - cy.get("#root > div > section > div > main > div > div.ant-pro-grid-content > div > div > div.ant-pro-page-container-children-content > div > div.ant-pro-card.ant-pro-table-search.ant-pro-table-search-query-filter > form > div > div.ant-col.ant-col-8.ant-col-offset-16 > div > div > div.ant-col.ant-form-item-control > div > div > div > div > div > div:nth-child(2) > button > span").click() - - //6.创建任务触发构建判断 - cy.wait('@create') - .then((interception) => { - cy.wrap({ - statusCode: interception.response?.statusCode - }).its("statusCode").should("eq", 200) - expect(interception.response.body.data).to.have.property("id") - - // 得到构建 ID - let id = interception.response?.body.data.id - getAndCheckTaskResult(id) - - }) - - }) - -}) \ No newline at end of file +/** + * This test is used for hotfix build test. + * It test hotfix build, convert to formal + * wait, refresh state function. + * Once the hotfix build success, change to + * formal hotfix list for futher testing. + */ +it("login success", function() { + cy.login() + + cy.intercept("POST", "api/v1/hotfix/upload_patch/") + .as("upload_patch") + + + cy.intercept("POST", "api/v1/hotfix/create_hotfix/") + .as("create") + + cy.intercept("GET", "api/v1/hotfix/get_hotfix_list/?*") + .as("gethotfixtasks") + + cy.intercept("POST", "api/v1/hotfix/set_formal/") + .as("set_formal") + + cy.intercept("DELETE", "api/v1/hotfix/delete_hotfix/") + .as("delete_hotfix") + + const getAndCheckTaskResult = (task_id) => { + // 1.点击一下刷新按钮 + cy.get('span[aria-label="reload"]').click() + cy.wait('@gethotfixtasks').its("response.statusCode").should("eq", 200) + cy.wait(100) + cy.get(".ant-table-tbody > tr:nth-child(1)").then($el => { + let current_text = $el.text() + if (current_text.indexOf("等待构建") != -1 || current_text.indexOf("正在构建") != -1) { + //2.诊断运行中,等待一分钟后再次检查 + cy.wait(1000*60) + getAndCheckTaskResult(task_id) + } else { + if (current_text.indexOf("构建成功") != -1) { + + cy.get(".ant-table-tbody > tr:nth-child(1) > td:nth-child(8)").click() + + //3.转正式包的确认 + cy.get(".ant-popover-content").first().within(() => { + cy.get("div.ant-popover-buttons > button.ant-btn.ant-btn-primary.ant-btn-sm").first().click() + }) + + //4.转正式包接口结果判断 + cy.wait('@set_formal') + .then((interception) => { + cy.wrap({ + statusCode: interception.response?.statusCode + }).its("statusCode").should("eq", 200) + }) + + + //5.点击下载按钮 + cy.get(".ant-table-tbody > tr:nth-child(1) > td:nth-child(9) > div").click() + + // 7. 构建任务成功以后,前往正式热补丁列表执行操作 + cy.visit('/hotfix/formal_hotfix'); + cy.get('#created_at').click(); + cy.get('.ant-picker-today-btn').click(); + //cy.get(':nth-child(1) > .ant-btn > span').click({multiple: true}); + cy.get('#hotfix_name').clear(); + cy.get('#hotfix_name').type('test'); + cy.get(':nth-child(2) > .ant-btn > span').click(); + cy.get('.ant-btn > .anticon > svg').click({multiple: true}); + cy.get('.ant-space > :nth-child(2) > span > a').click(); + + // 访问热补丁制作页面 + cy.visit("/hotfix/make") + + + //6.点击第一条数据删除 + cy.get('table').within(() => { + cy.get('tr').eq(1).contains("删除").click(); + }) + + //7.点击确认删除 + cy.get(".ant-popover-buttons > button.ant-btn.ant-btn-primary.ant-btn-sm").last().click() + + + //8.删除接口结果判断 + cy.wait('@delete_hotfix') + .then((interception) => { + cy.wrap({ + statusCode: interception.response?.statusCode + }).its("statusCode").should("eq", 200) + }) + } + + } + }) + } + + // 1. 访问热补丁中心构建页面 + cy.visit("/hotfix/make") + + //2. 填入参数 + cy.get('#kernel_version').type('5.10.112-11.1.al8.x86_64'); + cy.get('#hotfix_name').type('test'); + + //3.上传文件 + cy.get("#patch").selectFile('cypress/e2e/hotfix/5.10-new-globals.patch',{force: true}) + + + //4.文件结果判断 + cy.wait('@upload_patch') + .then((interception) => { + cy.wrap({ + statusCode: interception.response?.statusCode + }).its("statusCode").should("eq", 200) + }) + + //5.点击创建 + cy.get("#root > div > section > div > main > div > div.ant-pro-grid-content > div > div > div.ant-pro-page-container-children-content > div > div.ant-pro-card.ant-pro-table-search.ant-pro-table-search-query-filter > form > div > div.ant-col.ant-col-8.ant-col-offset-16 > div > div > div.ant-col.ant-form-item-control > div > div > div > div > div > div:nth-child(2) > button > span").click() + + //6.创建任务触发构建判断 + cy.wait('@create') + .then((interception) => { + cy.wrap({ + statusCode: interception.response?.statusCode + }).its("statusCode").should("eq", 200) + expect(interception.response.body.data).to.have.property("id") + + // 得到构建 ID + let id = interception.response?.body.data.id + getAndCheckTaskResult(id) + + }) + +}) diff --git a/sysom_web/cypress/e2e/log/audit_log.cy.js b/sysom_web/cypress/e2e/log/audit_log.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..84e6002e621a0f5c5f0dd540947b6e6a71463c23 --- /dev/null +++ b/sysom_web/cypress/e2e/log/audit_log.cy.js @@ -0,0 +1,16 @@ +/// + + +describe("SysOM Log Manager Test", () => { + beforeEach(() => { + cy.login() + }) + it.only("select audit log and filter audit log", () => { + cy.sysomLogSelectOrFilter("journal/audit", { + ip: '127.0.0.1', + path: '/api/v1/host/', + methond: 'GET', + request_type: 'operate' + }, true) + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/log/node_log.cy.js b/sysom_web/cypress/e2e/log/node_log.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..094814f2daee43197fb816e1b0120308574880dc --- /dev/null +++ b/sysom_web/cypress/e2e/log/node_log.cy.js @@ -0,0 +1,11 @@ +/// + + +describe("SysOM Log Manager Test", () => { + beforeEach(() => { + cy.login() + }) + it.only("select node log and filter node log", () => { + cy.sysomLogSelectOrFilter("journal/node", { instance: '192.168.0.137' }, false) + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/log/task_log.cy.js b/sysom_web/cypress/e2e/log/task_log.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..606a2e734dc8aacb56dcb7cd92f21f645ed16517 --- /dev/null +++ b/sysom_web/cypress/e2e/log/task_log.cy.js @@ -0,0 +1,14 @@ +/// + + +describe("SysOM Log Manager Test", () => { + beforeEach(() => { + cy.login() + }) + it.only("select task log and filter task log", () => { + cy.sysomLogSelectOrFilter("journal/task", { + task_id: 'YqS7Lr9P', + }, + false) + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/monitor/MySQL_observer.cy.js b/sysom_web/cypress/e2e/monitor/MySQL_observer.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..a056945b371bc67110412e85d79c317c705fa0a7 --- /dev/null +++ b/sysom_web/cypress/e2e/monitor/MySQL_observer.cy.js @@ -0,0 +1,149 @@ +/// + +describe("SysOM MySQL Observer Dashboard Test", () => { + beforeEach(() => { + cy.login(); + }) + + it("MySQL Observer test", () => { + // 1. 访问集群监控页面 + cy.visit("app_observable/mysql"); + + // 2. 等待页面加载完成 + cy.wait(2000); + + cy.getPannelContentByTitle("异常告警分布(次数)").contains("MySQL Error Alarm"); + cy.getPannelContentByTitle("异常告警分布(次数)").contains("MySQL Slow_Sql Alarm"); + cy.getPannelContentByTitle("异常告警分布(次数)").contains("MySQL Net_Drops Alarm"); + cy.getPannelContentByTitle("异常告警分布(次数)").contains("MySQL OOM Alarm"); + cy.getPannelContentByTitle("异常告警分布(次数)").contains("MySQL RT Alarm"); + cy.getPannelContentByTitle("异常告警分布(次数)").contains("MySQL Sched_Delay Alarm"); + cy.getPannelContentByTitle("异常告警分布(次数)").contains("MySQL Long_Time_D Alarm"); + cy.getPannelContentByTitle("异常告警分布(次数)").contains("Mysql CPU_High Alarm"); + + cy.getPannelContentByTitle("mySQL 连接线程池使用").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + + } + }) + + cy.getPannelContentByTitle("MySQL CPU占用率").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL CPU占用率").find("tbody tr").eq(0).find("td").eq(0).contains("user"); + cy.getPannelContentByTitle("MySQL CPU占用率").find("tbody tr").eq(1).find("td").eq(0).contains("sys"); + cy.getPannelContentByTitle("MySQL CPU占用率").find("tbody tr").eq(2).find("td").eq(0).contains("total"); + + cy.getPannelContentByTitle("MySQL CPU让出率").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL CPU让出率").find("tbody tr").eq(0).find("td").eq(0).contains("让出率"); + + + + cy.getPannelContentByTitle("MySQL Undolog链表长度&长事务").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + + } + }) + cy.getPannelContentByTitle("MySQL 内存缓存池使用").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + + } + }) + cy.getPannelContentByTitle("MySQL OS内存使用分布").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + cy.getPannelContentByTitle("MySQL Redolog使用量").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + + } + }) + cy.getPannelContentByTitle("MySQL RT").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL RT").find("tbody tr").eq(0).find("td").eq(0).contains("Avg"); + + cy.getPannelContentByTitle("MySQL请求详情").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + + } + }) + + cy.getPannelContentByTitle("MySQL 等待IO资源延迟(平均每秒)").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL 等待IO资源延迟(平均每秒)").find("tbody tr").eq(0).find("td").eq(0).contains("iowait"); + + cy.getPannelContentByTitle("MySQL申请OS内存延迟").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL申请OS内存延迟").find("tbody tr").eq(0).find("td").eq(0).contains("memAllocDelay"); + + cy.getPannelContentByTitle("MySQL OS调度延迟").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL OS调度延迟").find("tbody tr").eq(0).find("td").eq(0).contains("调度延迟"); + + cy.getPannelContentByTitle("MySQL数据IO处理延迟分布").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + cy.getPannelContentByTitle("MySQL数据IO处理延迟分布").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL数据IO处理延迟分布").find("tbody tr").eq(0).find("td").eq(0).contains("MySQL.Total_delay"); + cy.getPannelContentByTitle("MySQL数据IO处理延迟分布").find("tbody tr").eq(1).find("td").eq(0).contains("MySQL.Disk_delay"); + cy.getPannelContentByTitle("MySQL数据IO处理延迟分布").find("tbody tr").eq(2).find("td").eq(0).contains("MySQL.OS_delay_by_io_block"); + cy.getPannelContentByTitle("MySQL数据IO处理延迟分布").find("tbody tr").eq(3).find("td").eq(0).contains("MySQL.OS_delay_by_disk_driver"); + cy.getPannelContentByTitle("MySQL数据IO处理延迟分布").find("tbody tr").eq(4).find("td").eq(0).contains("MySQL.OS_delay_by_io_complete"); + cy.getPannelContentByTitle("MySQL数据IO处理延迟分布").find("tbody tr").eq(5).find("td").eq(0).contains("MySQL.OS_delay_by_io_done"); + } + }) + + cy.getPannelContentByTitle("MySQL 磁盘各队列级IO延迟分布").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + cy.getPannelContentByTitle("MySQL 磁盘各队列级IO延迟分布").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL 磁盘各队列级IO延迟分布").find("tbody tr").eq(0).find("td").eq(0).contains(/[a-z]+\.Qid[0-9]+\.Total_delay/); + cy.getPannelContentByTitle("MySQL 磁盘各队列级IO延迟分布").find("tbody tr").eq(1).find("td").eq(0).contains(/[a-z]+\.Qid[0-9]+\.Disk_delay/); + cy.getPannelContentByTitle("MySQL 磁盘各队列级IO延迟分布").find("tbody tr").eq(2).find("td").eq(0).contains(/[a-z]+\.Qid[0-9]+\.OS_delay/); + } + }) + + cy.getPannelContentByTitle("MySQL 磁盘级IO延迟分布").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + cy.getPannelContentByTitle("MySQL 磁盘级IO延迟分布").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL 磁盘级IO延迟分布").find("tbody tr").eq(0).find("td").eq(0).contains(/[a-z]+\.Total_delay/); + cy.getPannelContentByTitle("MySQL 磁盘级IO延迟分布").find("tbody tr").eq(1).find("td").eq(0).contains(/[a-z]+\.Disk_delay/); + cy.getPannelContentByTitle("MySQL 磁盘级IO延迟分布").find("tbody tr").eq(2).find("td").eq(0).contains(/[a-z]+\.OS_delay/); + } + }) + + cy.getPannelContentByTitle("MySQL每CPU生产IO请求热力分布").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + } + }) + + cy.getPannelContentByTitle("MySQL每CPU处理IO中断热力分布").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + } + }) + + cy.getPannelContentByTitle("MySQL IO吞吐").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL IO吞吐").find("tbody tr").eq(0).find("td").eq(0).contains("rBPS"); + cy.getPannelContentByTitle("MySQL IO吞吐").find("tbody tr").eq(1).find("td").eq(0).contains("wBPS"); + + cy.getPannelContentByTitle("MySQL OS脏页量").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL OS脏页量").find("tbody tr").eq(0).find("td").eq(0).contains("Dirty Pages"); + cy.getPannelContentByTitle("MySQL OS脏页量").find("tbody tr").eq(1).find("td").eq(0).contains("Dirty Thresh"); + + cy.getPannelContentByTitle("MySQL网络吞吐").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL网络吞吐").find("tbody tr").eq(0).find("td").eq(0).contains("netRecTraffic"); + cy.getPannelContentByTitle("MySQL网络吞吐").find("tbody tr").eq(1).find("td").eq(0).contains("netSendTraffic"); + + cy.getPannelContentByTitle("MySQL请求数").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("MySQL请求数").find("tbody tr").eq(0).find("td").eq(0).contains("mysql requestCnt"); + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/monitor/cluster_monitor.cy.js b/sysom_web/cypress/e2e/monitor/cluster_monitor.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..fc67781e494307502a27e5e5aa5a024b77e2d700 --- /dev/null +++ b/sysom_web/cypress/e2e/monitor/cluster_monitor.cy.js @@ -0,0 +1,134 @@ +/// + +describe("SysOM Cluster Monitor Dashboard Test", () => { + beforeEach(() => { + cy.login(); + }) + + it("Cluster monitor test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/cluster_monitor"); + + // 2. 等待页面加载完成 + cy.wait(2000); + + // assert health score below 100 + cy.getPannelContentByTitle("Cluster Health").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + cy.wrap($el).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.lte(100); + expect(num).to.be.gte(0); + }) + } + }) + + cy.getPannelContentByTitle("Errors Health").then(($el) => { + if ($el.text().includes("N/A")) { + cy.wrap($el).contains("N/A"); + } else { + cy.wrap($el).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.lte(100); + expect(num).to.be.gte(0); + }) + } + }) + + cy.getPannelContentByTitle("Latency Health").then(($el) => { + if ($el.text().includes("N/A")) { + cy.wrap($el).contains("N/A"); + } else { + cy.wrap($el).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.lte(100); + expect(num).to.be.gte(0); + }) + } + }) + + cy.getPannelContentByTitle("Load(Traffic) Health").then(($el) => { + if ($el.text().includes("N/A")) { + cy.wrap($el).contains("N/A"); + } else { + cy.wrap($el).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.lte(100); + expect(num).to.be.gte(0) + }) + } + }) + + cy.getPannelContentByTitle("集群总CPU核数/节点数").contains("cores"); + cy.getPannelContentByTitle("集群总CPU核数/节点数").contains("nodes"); + + cy.getPannelContentByTitle("集群CPU利用率分布").find("ul li").should("have.length.gte", 1); // Legend 至少有一列 + + cy.getPannelContentByTitle("集群内存使用分布").find("ul li").should("have.length.gte", 1); // Legend 至少有一列 + + cy.getPannelContentByTitle("集群内存总量").contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.least(0); + }) + + cy.getPannelContentByTitle("节点内存延时诊断").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + + } + }) + + cy.getPannelContentByTitle("容器内存延时诊断").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + + } + }) + + cy.getPannelContentByTitle("节点内存使用率诊断").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + + cy.getPannelContentByTitle("节点CPU利用率诊断").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + + cy.getPannelContentByTitle("节点CPU延时诊断").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + + cy.getPannelContentByTitle("集群平均 CPU利用率").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + + cy.getPannelContentByTitle("集群平均内存使用率").find("ul li").should("have.length.gte", 1); // Legend 至少有一列 + + cy.getPannelContentByTitle("集群平均CPU利用率分布情况").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("集群平均CPU利用率分布情况").find("tbody tr").eq(0).find("td").eq(0).contains("user"); + cy.getPannelContentByTitle("集群平均CPU利用率分布情况").find("tbody tr").eq(1).find("td").eq(0).contains("nice"); + cy.getPannelContentByTitle("集群平均CPU利用率分布情况").find("tbody tr").eq(2).find("td").eq(0).contains("sys"); + cy.getPannelContentByTitle("集群平均CPU利用率分布情况").find("tbody tr").eq(3).find("td").eq(0).contains("softirq"); + cy.getPannelContentByTitle("集群平均CPU利用率分布情况").find("tbody tr").eq(4).find("td").eq(0).contains("iowait"); + + cy.getPannelContentByTitle("总内存使用情况").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("总内存使用情况").find("tbody tr").eq(0).find("td").eq(0).contains("free"); + cy.getPannelContentByTitle("总内存使用情况").find("tbody tr").eq(1).find("td").eq(0).contains("used"); + cy.getPannelContentByTitle("总内存使用情况").find("tbody tr").eq(2).find("td").eq(0).contains("cache+buffer"); + cy.getPannelContentByTitle("总内存使用情况").find("tbody tr").eq(3).find("td").eq(0).contains("total"); + + cy.getPannelContentByTitle("集群平均调度延迟").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("集群平均调度延迟").find("tbody tr").eq(0).find("td").eq(0).contains("调度延迟"); + + cy.getPannelContentByTitle("集群用户态内存使用情况").find("ul li").should("have.length.gte", 1); + + cy.getPannelContentByTitle("集群节点平均load1").find("tbody tr").should("have.length.gte", 1); + + cy.getPannelContentByTitle("集群内核态内存使用情况").find("ul li").should("have.length.gte", 1); + + cy.getPannelContentByTitle("集群节点任务统计信息").find("tbody tr").should("have.length.gte", 1); + cy.getPannelContentByTitle("集群节点任务统计信息").find("tbody tr").eq(0).find("td").eq(0).contains("nr_forks"); + cy.getPannelContentByTitle("集群节点任务统计信息").find("tbody tr").eq(1).find("td").eq(0).contains("nr_blocked"); + + cy.getPannelContentByTitle("集群app与kernel内存对比").find("ul li").should("have.length.gte", 1); + + cy.getPannelContentByTitle("容器大盘").contains("容器大盘详情"); + + cy.getPannelContentByTitle("节点大盘").contains("节点大盘详情"); + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/monitor/container_monitor.cy.js b/sysom_web/cypress/e2e/monitor/container_monitor.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..d2102a0c24fc471005d6863ff47eebb790ecee1f --- /dev/null +++ b/sysom_web/cypress/e2e/monitor/container_monitor.cy.js @@ -0,0 +1,158 @@ +/// + +describe("SysOM Container Monitor Dashboard Test", () => { + beforeEach(() => { + cy.login(); + }) + + it("Container Monitor", () => { + cy.wait(2000) + + // 1. 访问集群监控页面 + cy.visit("/monitor/container_monitor"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + /* + * Pod Health + */ + + // assert health score below 100 + cy.getPannelContentByTitle("Pod Health").then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + cy.wrap($el).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.lte(100); + expect(num).to.be.gte(0); + }) + } + }) + + cy.getPannelContentByTitle("Errors Health").then(($el) => { + if ($el.text().includes("N/A")) { + cy.wrap($el).contains("N/A"); + } else { + cy.wrap($el).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.lte(100); + expect(num).to.be.gte(0); + }) + } + }) + + cy.getPannelContentByTitle("Latency Health").then(($el) => { + if ($el.text().includes("N/A")) { + cy.wrap($el).contains("N/A"); + } else { + cy.wrap($el).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.lte(100); + expect(num).to.be.gte(0); + }) + } + }) + + cy.getPannelContentByTitle("Load(Traffic) Health").then(($el) => { + if ($el.text().includes("N/A")) { + cy.wrap($el).contains("N/A"); + } else { + cy.wrap($el).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.lte(100); + expect(num).to.be.gte(0) + }) + } + }) + + /* + * Pod Memory Monitor + */ + cy.openMainLabel("Pod Memory Monitor") + cy.openMainLabel("Pod Memory Monitor") + + // 当前面板Pod Cache Usage + cy.panelAtagValueGteOrNoDataTest("Pod Memory Usage (top 5)") + + // 当前面板Pod Cache Usage + cy.panelAtagValueGteOrNoDataTest("Pod Cache Usage (top 5)") + + // 当前面板Pod Mem Stat + cy.panelAtagValueGteOrNoDataTest("Pod Mem Stat") + + // 当前面板Pod Cached File + cy.panelAtagValueGteOrNoDataTest("Pod Cached File (top 5)") + + // 当前面板Pod Mem Event + cy.panelAtagValueGteOrNoDataTest("Pod Mem Event") + + // 当前面板Pod Memory Rate + cy.panelAtagValueGteOrNoDataTest("Memory Rate") + + // 当前面板Memory Global Direct Reclaim Latency + cy.panelAtagValueGteOrNoDataTest("Memory Global Direct Reclaim Latency") + + // 当前面板Memory Direct Reclaim Latency + cy.panelAtagValueGteOrNoDataTest("Memory Direct Reclaim Latency") + + // 当前面板Memory Compact Latency + cy.panelAtagValueGteOrNoDataTest("Memory Compact Latency") + + /* + * Pod CPU Monitor + */ + + cy.openMainLabel("Pod CPU Monitor") + cy.openMainLabel("Pod CPU Monitor") + + // 当前面板Pod CPU Usage + cy.panelAtagValueGteOrNoDataTest("Pod CPU Usage") + + // 当前面板Pod CPU nr_throttled + cy.panelAtagValueGteOrNoDataTest("Pod CPU nr_throttled") + + // 当前面板Pod wait_latency + cy.panelAtagValueGteOrNoDataTest("Pod wait_latency") + + // 当前面板Pod cfs_quota + cy.panelAtagValueGteOrNoDataTest("Pod cfs_quota") + + /* + * Pod Network Monitor + */ + + cy.openMainLabel("Pod Network Monitor") + cy.openMainLabel("Pod Network Monitor") + + // 当前面板Pod Network Traffic by Bytes + cy.panelAtagValueGteOrNoDataTest("Pod Network Traffic by Bytes") + + // 当前面板Pod Network Traffic by Packets + cy.panelAtagValueGteOrNoDataTest("Pod Network Traffic by Packets") + + // 当前面板Pod Network Traffic Drop + cy.panelAtagValueGteOrNoDataTest("Pod Network Traffic Drop") + + /* + * Pod IO Monitor + */ + + cy.openMainLabel("Pod IO Monitor") + cy.openMainLabel("Pod IO Monitor") + + // 当前面板Pod Writes/Reads Bytes Rates + cy.panelAtagValueGteOrNoDataTest("Pod Writes/Reads Bytes Rates"); + + // 当前面板Pod Writes/Reads IOs Rates + cy.panelAtagValueGteOrNoDataTest("Pod Writes/Reads IOs Rates"); + + // 当前面板Pod IO Queued + cy.panelAtagValueGteOrNoDataTest("Pod IO Queued"); + + // 当前面板Pod IO Wait Time + cy.panelAtagValueGteOrNoDataTest("Pod IO Wait Time"); + }) + +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/monitor/dashboard.cy.js b/sysom_web/cypress/e2e/monitor/dashboard.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..4500c60d4da65bd37c048cd920f9a8c7e9b26760 --- /dev/null +++ b/sysom_web/cypress/e2e/monitor/dashboard.cy.js @@ -0,0 +1,690 @@ +/* + * @Author: wb-msm241621 + * @Date: 2023-12-20 17:15:53 + * @LastEditTime: 2023-12-22 15:54:55 + * @Description: + */ +/// + + +describe("SysOM dashboard Monitor Dashboard Test", () => { + beforeEach(() => { + cy.login(); + }) + + it("Dashboard monitor test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + /* Quick CPU / Mem / Disk */ + // 当前面板 CPU Busy + cy.panelNumericalValueGteTest("CPU Busy") + + // 当前面板Sys Load (5m avg) + cy.panelNumericalValueGteTest("Sys Load (5m avg)") + + // 当前面板Sys Load (15m avg) + cy.panelNumericalValueGteTest("Sys Load (15m avg)") + + // 当前面板RAM Used + cy.panelNumericalValueGteTest("RAM Used") + + // 当前面板SWAP Used + cy.panelNumericalValueGteTest("SWAP Used") + + // 当前面板Root FS Used + cy.panelNumericalValueGteTest("Root FS Used") + + // 当前面板 CPU Cores + cy.panelNumericalValueGteTest("CPU Cores") + + // 当前面板 Uptime + cy.panelNumericalValueGteTest("Uptime") + + // 当前面板 RootFS Total + cy.panelNumericalValueGteTest("RootFS Total") + + // 当前面板 RAM Total + cy.panelNumericalValueGteTest("RAM Total") + + // 当前面板 SWAP Total + cy.panelNumericalValueGteTest("SWAP Total") + }) + + it("Basic CPU / Mem / Net / Disk Test", () => { + /* + * Basic CPU / Mem / Net / Disk + */ + + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 当前面板 CPU Basic + cy.panelAtagValueGteTest("CPU Basic") + + // 当前面板Memory Basic + cy.panelAtagValueGteTest("Memory Basic") + + // 当前面板 Network Traffic Basic + cy.panelAtagValueGteTest("Network Traffic Basic") + + // 当前面板 Disk Space Used Basic + cy.panelAtagValueGteTest("Disk Space Used Basic") + }) + + it("CPU / Memory / Net / Disk Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开CPU / Memory / Net / Disk 标签 + cy.openMainLabel("CPU / Memory / Net / Disk") + + // 当前面板 CPU + const cpuPropertys = [ + // 'System - Processes executing in kernel mode', + // 'User - Normal processes executing in user mode', + // 'Nice - Niced processes executing in user mode', + // 'Idle - Waiting for something to happen', + // 'Iowait - Waiting for I/O to complete', + // 'Irq - Servicing interrupts', + // 'Softirq - Servicing softirqs', + // 'Steal - Time spent in other operating systems when running in a virtualized environment' + ] + cy.panelFoldLineTableGteTest('CPU') + + // 当前面板 Memory Stack + // const memoryStackPropertys = [ + // 'Apps - Memory used by user-space applications', + // 'PageTables - Memory used to map between virtual and physical memory addresses', + // 'SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified', + // 'Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)', + // 'Cache - Parked file data (file content) cache', + // 'Buffers - Block device (e.g. harddisk) cache', + // 'Unused - Free memory unassigned', + // 'Swap - Swap space used', + // 'Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working', + // ] + const memoryStackPropertys = [] + cy.panelFoldLineTableGteTest('Memory Stack', memoryStackPropertys) + + // 当前面板 Network Traffic + const networkTrafficPropertys = [ + // 'eth0 - Receive', + // 'lo - Receive', + // 'eth0 - Transmit', + // 'lo - Transmit', + ] + cy.panelFoldLineTableGteTest('Network Traffic', networkTrafficPropertys) + + // 当前面板 Disk Space Used + const diskSpaceUsedPropertys = [ + // '/run/user/0', + // '/run', + // '/usr/local/sysom/server/builder/hotfix', + // '/', + ] + cy.panelFoldLineTableGteTest('Disk Space Used', diskSpaceUsedPropertys) + + // 当前面板 Disk IOps + const diskIopsPropertys = [ + // 'vda - Writes completed' + ] + cy.panelFoldLineTableGteTest('Disk IOps', diskIopsPropertys) + + // 当前面板 I/O Usage Read / Write + const ioUsageReadWritePropertys = [ + // 'vda - Successfully read bytes', + // 'vda - Successfully written bytes' + ] + cy.panelFoldLineTableGteTest('I/O Usage Read / Write', ioUsageReadWritePropertys) + + // 当前面板 I/O Utilization + const ioUtilizationPropertys = [ + // 'vda' + ] + cy.panelFoldLineTableGteTest('I/O Utilization', ioUtilizationPropertys) + + + }) + + it("Memory Meminfo Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Memory Meminfo 标签 + cy.openMainLabel("Memory Meminfo") + + // 当前面板 Memory Active / Inactive + const memoryActivePropertys = [ + // 'Inactive - Memory which has been less recently used. It is more eligible to be reclaimed for other purposes', + // 'Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary' + ] + cy.panelFoldLineTableGteTest('Memory Active / Inactive', memoryActivePropertys) + + // 当前面板 Memory Commited + const memoryCommitedPropertys = [ + // 'Committed_AS - Amount of memory presently allocated on the system', + // 'CommitLimit - Amount of memory currently available to be allocated on the system' + ] + cy.panelFoldLineTableGteTest('Memory Commited', memoryCommitedPropertys) + + // 当前面板 Memory Active / Inactive Detail + const memoryActiveDetailPropertys = [ + // 'Inactive_file - File-backed memory on inactive LRU list', + // 'Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)', + // 'Active_file - File-backed memory on active LRU list', + // 'Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs' + ] + cy.panelFoldLineTableGteTest('Memory Active / Inactive Detail', memoryActiveDetailPropertys) + + // 当前面板 Memory Writeback and Dirty + const memoryWritebackAndDirtyPropertys = [ + // 'Writeback - Memory which is actively being written back to disk', + // 'WritebackTmp - Memory used by FUSE for temporary writeback buffers', + // 'Dirty - Memory which is waiting to get written back to the disk' + ] + cy.panelFoldLineTableGteTest('Memory Writeback and Dirty', memoryWritebackAndDirtyPropertys) + + // 当前面板 Memory Shared and Mapped + const memorySharedAndMappedPropertys = [ + // 'Mapped - Used memory in mapped pages files which have been mmaped, such as libraries', + // 'Shmem - Used shared memory (shared between several processes, thus including RAM disks)', + // 'ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages', + // 'ShmemPmdMapped - Ammount of shared (shmem/tmpfs) memory backed by huge pages' + ] + cy.panelFoldLineTableGteTest('Memory Shared and Mapped', memorySharedAndMappedPropertys) + + // 当前面板 Memory Slab + const memorySlabPropertys = [ + // 'SUnreclaim - Part of Slab, that cannot be reclaimed on memory pressure', + // 'SReclaimable - Part of Slab, that might be reclaimed, such as caches' + ] + cy.panelFoldLineTableGteTest('Memory Slab', memorySlabPropertys) + + // 当前面板 Memory Vmalloc + const memoryVmallocPropertys = [ + // "VmallocChunk - Largest contigious block of vmalloc area which is free", + // "VmallocTotal - Total size of vmalloc memory area", + // "VmallocUsed - Amount of vmalloc area which is used" + ] + cy.panelFoldLineTableGteTest('Memory Vmalloc', memoryVmallocPropertys) + + // 当前面板 Memory Bounce + const memoryBouncePropertys = [ + // 'Bounce - Memory used for block device bounce buffers' + ] + cy.panelFoldLineTableGteTest('Memory Bounce', memoryBouncePropertys) + + // 当前面板 Memory Anonymous + const memoryAnonymousPropertys = [ + // 'AnonHugePages - Memory in anonymous huge pages', + // 'AnonPages - Memory in user pages not backed by files' + ] + cy.panelFoldLineTableGteTest('Memory Anonymous', memoryAnonymousPropertys) + + // 当前面板 Memory Kernel / CPU + const memoryKernelCPUPropertys = [ + // 'KernelStack - Kernel memory stack. This is not reclaimable', + // 'PerCPU - Per CPU memory allocated dynamically by loadable modules' + ] + cy.panelFoldLineTableGteTest('Memory Kernel / CPU', memoryKernelCPUPropertys) + + // 当前面板 Memory HugePages Counter + const MemoryHugePagesCounterpropertys = [ + // 'HugePages_Free - Huge pages in the pool that are not yet allocated', + // 'HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made', + // 'HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages' + ] + cy.panelFoldLineTableGteTest('Memory HugePages Counter', MemoryHugePagesCounterpropertys) + + // 当前面板 Memory HugePages Size + const MemoryHugePagesSizePropertys = [ + // 'HugePages - Total size of the pool of huge pages', + // 'Hugepagesize - Huge Page size' + ] + cy.panelFoldLineTableGteTest('Memory HugePages Size', MemoryHugePagesSizePropertys) + + // 当前面板 Memory DirectMap + const MemoryDirectMapPropertys = [ + // 'DirectMap1G - Amount of pages mapped as this size', + // 'DirectMap2M - Amount of pages mapped as this size', + // 'DirectMap4K - Amount of pages mapped as this size' + ] + cy.panelFoldLineTableGteTest('Memory DirectMap', MemoryDirectMapPropertys) + + // 当前面板 Memory Unevictable and MLocked + const MemoryUnevictableandMLockedPropertys = [ + // "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons", + // "MLocked - Size of pages locked to memory using the mlock() system call" + ] + cy.panelFoldLineTableGteTest('Memory Unevictable and MLocked', MemoryUnevictableandMLockedPropertys) + + // 当前面板 Memory NFS + const MemoryNFSPropertys = [ + // "NFS Unstable - Memory in NFS pages sent to the server, but not yet commited to the storage" + ] + cy.panelFoldLineTableGteTest('Memory NFS') + }) + + it("Memory Vmstat Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Memory Vmstat 标签 + cy.openMainLabel("Memory Vmstat") + + // 当前面板 Memory Pages In / Out + cy.panelFoldLineTableGteTest('Memory Pages In / Out') + + // 当前面板 Memory Pages Swap In / Out + cy.panelFoldLineTableGteTest('Memory Pages Swap In / Out') + + // 当前面板 Memory Page Faults + cy.panelFoldLineTableGteTest('Memory Page Faults') + + // 当前面板OOM Killer + cy.panelFoldLineTableGteTest('OOM Killer') + }) + + it("System Timesync Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 System Timesync 标签 + cy.openMainLabel("System Timesync") + + // 当前面板 Time Syncronized Drift + cy.panelFoldLineTableGteTest('Time Syncronized Drift') + + // 当前面板 Time PLL Adjust + cy.panelFoldLineTableGteTest('Time PLL Adjust') + + // 当前面板 Time Syncronized Status + cy.panelFoldLineTableGteTest('Time Syncronized Status') + + // 当前面板 Time Misc + cy.panelFoldLineTableGteTest('Time Misc') + }) + + it("System Processes Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 标签 + cy.openMainLabel("System Processes") + + // 当前面板 Processes Status + cy.panelFoldLineTableGteTest('Processes Status') + + // 当前面板 Processes State + cy.panelNoDataTest('Processes State') + + // 当前面板 Processes Forks + cy.panelFoldLineTableGteTest('Processes Forks') + + // 当前面板 Processes Memory + cy.panelFoldLineTableGteTest('Processes Memory') + + // 当前面板 PIDs Number and Limit + cy.panelNoDataTest('PIDs Number and Limit') + // cy.getPannelContentByTitle("PIDs Number and Limit").find("tbody tr").should("have.length.gte", 1) + + // 当前面板 Process schedule stats Running / Waiting + cy.panelFoldLineTableGteTest('Process schedule stats Running / Waiting') + + // 当前面板 Threads Number and Limit + cy.panelNoDataTest('Threads Number and Limit') + }) + + it("System Misc Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 System Misc 标签 + cy.openMainLabel("System Misc") + + // 当前面板 Context Switches / Interrupts + cy.panelFoldLineTableGteTest("Context Switches / Interrupts") + + // 当前面板 System Load + cy.panelFoldLineTableGteTest("System Load") + + // 当前面板 Interrupts Detail + cy.panelNoDataTest("Interrupts Detail") + + // 当前面板 Schedule timeslices executed by each cpu + cy.panelFoldLineTableGteTest("Schedule timeslices executed by each cpu") + + // 当前面板 Entropy + cy.panelFoldLineTableGteTest("Entropy") + + // 当前面板 CPU time spent in user and system contexts + cy.panelFoldLineTableGteTest("CPU time spent in user and system contexts") + + // 当前面板 File Descriptors + cy.panelFoldLineTableGteTest("File Descriptors") + }) + + it("Hardware Misc Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Hardware Misc 标签 + cy.openMainLabel("Hardware Misc") + + // 当前面板 Hardware temperature monitor + cy.panelNoDataTest("Hardware temperature monitor") + + // 当前面板 Throttle cooling device + cy.panelFoldLineTableGteTest("Throttle cooling device") + + // 当前面板 Power supply + cy.panelNoDataTest("Power supply") + }) + + it("Systemd Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Systemd 标签 + cy.openMainLabel("Systemd") + + // 当前面板 Systemd Sockets + cy.panelNoDataTest("Systemd Sockets") + + // 当前面板 Systemd Units State + cy.panelNoDataTest("Systemd Units State") + }) + + it("Storage Disk Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Storage Disk 标签 + cy.openMainLabel("Storage Disk") + + // 当前面板 Disk IOps Completed + cy.panelFoldLineTableGteTest("Disk IOps Completed") + + // 当前面板 Disk R/W Data + cy.panelFoldLineTableGteTest("Disk R/W Data") + + // 当前面板 Disk Average Wait Time + cy.panelFoldLineTableGteTest("Disk Average Wait Time") + + // 当前面板 Average Queue Size + cy.panelFoldLineTableGteTest("Average Queue Size") + + // 当前面板 Disk R/W Merged + cy.panelFoldLineTableGteTest("Disk R/W Merged") + + // 当前面板 Time Spent Doing I/Os + cy.panelFoldLineTableGteTest("Time Spent Doing I/Os") + + // 当前面板 Instantaneous Queue Size + cy.getPannelContentByTitle("Instantaneous Queue Size").find("canvas").should("have.length.gte", 1) + cy.getPannelContentByTitle("Instantaneous Queue Size").find("tbody").should("have.length.gte", 1) + + // 当前面板 Disk IOps Discards completed / merged + cy.getPannelContentByTitle("Disk IOps Discards completed / merged").find("canvas").should("have.length.gte", 1) + cy.getPannelContentByTitle("Disk IOps Discards completed / merged").find("tbody").should("have.length.gte", 1) + }) + + it("Storage Filesystem Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Storage Filesystem 标签 + + cy.openMainLabel("Storage Filesystem") + + // 当前面板 Filesystem space available + cy.panelFoldLineTableGteTest("Filesystem space available") + + // 当前面板 File Nodes Free + cy.panelFoldLineTableGteTest("File Nodes Free") + + // 当前面板 File Descriptor + cy.panelFoldLineTableGteTest("File Descriptor") + + // 当前面板 File Nodes Size + cy.panelFoldLineTableGteTest("File Nodes Size") + + // 当前面板 Filesystem in ReadOnly / Error + // cy.panelFoldLineTableGteTest("Filesystem in ReadOnly / Error") + cy.getPannelContentByTitle("Filesystem in ReadOnly / Error").find("canvas").should('be.empty') + cy.getPannelContentByTitle("Filesystem in ReadOnly / Error").find('tbody').should('be.empty') + }) + + it("Network Traffic Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Network Traffic 标签 + cy.openMainLabel("Network Traffic") + + // 当前面板 Network Traffic by Packets + cy.panelFoldLineTableGteTest("Network Traffic by Packets") + + // 当前面板 Network Traffic Errors + cy.panelFoldLineTableGteTest("Network Traffic Errors") + + // 当前面板 Network Traffic Drop + cy.panelFoldLineTableGteTest("Network Traffic Drop") + + // 当前面板 Network Traffic Compressed + cy.panelFoldLineTableGteTest("Network Traffic Compressed") + + // 当前面板 Network Traffic Multicast + cy.panelFoldLineTableGteTest("Network Traffic Multicast") + + // 当前面板 Network Traffic Fifo + cy.panelFoldLineTableGteTest("Network Traffic Fifo") + + // 当前面板 Network Traffic Frame + cy.panelFoldLineTableGteTest("Network Traffic Frame") + + // 当前面板 Network Traffic Carrier + cy.panelFoldLineTableGteTest("Network Traffic Carrier") + + // 当前面板 Network Traffic Colls + cy.panelFoldLineTableGteTest("Network Traffic Colls") + + // 当前面板 NF Contrack + cy.panelNoDataTest("NF Contrack") + cy.wait(4000) + + // 当前面板 ARP Entries + cy.panelFoldLineTableGteTest("ARP Entries") + + // 当前面板 MTU + cy.panelFoldLineTableGteTest("MTU") + + // 当前面板 Speed + cy.panelFoldLineTableGteTest("Speed") + + // 当前面板 Queue Length + cy.panelFoldLineTableGteTest("Queue Length") + + // 当前面板 Softnet Packets + cy.panelFoldLineTableGteTest("Softnet Packets") + + // 当前面板 Softnet Out of Quota + cy.panelFoldLineTableGteTest("Softnet Out of Quota") + + // 当前面板 Network Operational Status + cy.panelFoldLineTableGteTest("Network Operational Status") + + }) + + it("Network Sockstat Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Network Sockstat 标签 + cy.openMainLabel("Network Sockstat") + + // 当前面板 Sockstat TCP + cy.panelFoldLineTableGteTest("Sockstat TCP") + + // 当前面板 Sockstat UDP + cy.panelFoldLineTableGteTest("Sockstat UDP") + + // 当前面板 Sockstat Used + cy.panelFoldLineTableGteTest("Sockstat Used") + + // 当前面板 Sockstat Memory Size + cy.panelFoldLineTableGteTest("Sockstat Memory Size") + + // 当前面板 Sockstat FRAG / RAW + cy.panelFoldLineTableGteTest("Sockstat FRAG / RAW") + }) + + it("Network Netstat Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Network Netstat 标签 + cy.openMainLabel("Network Netstat") + + // 当前面板 Netstat IP In / Out Octets + cy.panelFoldLineTableGteTest("Netstat IP In / Out Octets") + + // 当前面板 Netstat IP Forwarding + cy.panelFoldLineTableGteTest("Netstat IP Forwarding") + + // 当前面板 ICMP In / Out + cy.panelFoldLineTableGteTest("ICMP In / Out") + + // 当前面板 ICMP Errors + cy.panelFoldLineTableGteTest("ICMP Errors") + + // 当前面板 UDP In / Out + cy.panelFoldLineTableGteTest("UDP In / Out") + + // 当前面板 UDP Errors + cy.panelFoldLineTableGteTest("UDP Errors") + + // 当前面板 TCP In / Out + cy.panelFoldLineTableGteTest("TCP In / Out") + + // 当前面板 TCP Errors + cy.panelFoldLineTableGteTest("TCP Errors") + + // 当前面板 TCP Connections + cy.panelFoldLineTableGteTest("TCP Connections") + + // 当前面板 TCP SynCookie + cy.panelFoldLineTableGteTest("TCP SynCookie") + + // 当前面板 TCP Direct Transition + cy.panelFoldLineTableGteTest("TCP Direct Transition") + }) + + it("Node Exporter Test", () => { + // 1. 访问集群监控页面 + cy.visit("/monitor/dashboard"); + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 关闭Quick CPU / Mem / Disk标签 + cy.openMainLabel("Quick CPU / Mem / Disk") + + // 4. 打开 Node Exporter 标签 + cy.openMainLabel("Node Exporter") + + // 当前面板 Node Exporter Scrape Time + cy.panelFoldLineTableGteTest("Node Exporter Scrape Time") + + // 当前面板 Node Exporter Scrape + cy.panelFoldLineTableGteTest("Node Exporter Scrape") + }) +}) diff --git a/sysom_web/cypress/e2e/monitor/migrate_monitor.cy.js b/sysom_web/cypress/e2e/monitor/migrate_monitor.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..b8d3088b055c11d04765abc86ad55d7ca12878b3 --- /dev/null +++ b/sysom_web/cypress/e2e/monitor/migrate_monitor.cy.js @@ -0,0 +1,253 @@ +/// + +describe("SysOM Migration Monitor Dashboard Test", () => { + beforeEach(() => { + cy.login() + }) + + it("Migration monitor test", () => { + // 1. 访问主机列表也米娜 + cy.visit("/monitor/migration"); + + // 2. 等待页面加载完成 + cy.wait(2000); + + // 当前内核版本(Stat面板数值类型) + cy.getPannelContentByTitle("当前内核版本").contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 当前内存可用总量(Stat面板数值类型) + cy.getPannelContentByTitle("当前内存可用总量").contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 当前大页内存总量(Stat面板数值类型) + cy.getPannelContentByTitle("当前大页内存总量").contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + + // 当前磁盘可用空间总量(Stat面板数值类型) + cy.getPannelContentByTitle("当前磁盘可用空间总量").contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 当前磁盘个数(Stat面板数值类型) + cy.getPannelContentByTitle("当前磁盘个数").contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 当前网卡数量(Stat面板数值类型) + cy.getPannelContentByTitle("当前网卡数量").contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 当前启用网卡数量(Stat面板数值类型) + cy.getPannelContentByTitle("当前启用网卡数量").contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 可用内存(Time series 面板) + cy.getPannelContentByTitle("可用内存").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("可用内存").find("tbody tr").eq(0).find("td").eq(0).contains("total"); // 第一列的第一行是 total + cy.getPannelContentByTitle("可用内存").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + cy.getPannelContentByTitle("可用内存").find("tbody tr").eq(1).find("td").eq(0).contains("free"); // 第一列的第二行是 free + cy.getPannelContentByTitle("可用内存").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 可用磁盘空间(Time series 面板) + cy.getPannelContentByTitle("可用磁盘空间").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("可用磁盘空间").find("tbody tr").eq(0).find("td").eq(0).contains("total"); // 第一列的第一行是 total + cy.getPannelContentByTitle("可用磁盘空间").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + cy.getPannelContentByTitle("可用磁盘空间").find("tbody tr").eq(1).find("td").eq(0).contains("avaliable"); // 第一列的第二行是 avaliable + cy.getPannelContentByTitle("可用磁盘空间").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // CPUs(Stat面板数值类型) + cy.getPannelContentByTitle("CPUs").contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // CPU 利用率(Time series 面板) + cy.getPannelContentByTitle("CPU 利用率").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + cy.getPannelContentByTitle("CPU 利用率").find("tbody tr").eq(0).find("td").eq(0).contains("今日"); // 第一列的第一行是 “今日” + cy.getPannelContentByTitle("CPU 利用率").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第一列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // CPU 利用率两日差值(Time series 面板)=> 如果无数据也是正常的,可以用下列这种方式分情况判断 + cy.getPannelContentByTitle("CPU 利用率两日差值") + .then(($el) => { + if ($el.text().includes("No data")) { + // 面板没有数据的情况 + cy.wrap($el).contains("No data"); + } else { + // 如果面板有数据的情况 + cy.wrap($el).find("tbody tr").should("have.length.gte", 1); // Legend 至少有1列 + cy.wrap($el).find("tbody tr").eq(0).find("td").eq(0).contains("CPU 利用率差值"); // 第一列的第一行是 “CPU 利用率差值” + cy.wrap($el).find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第一列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + // cy.wrap($el).find("tbody tr").eq(1).find("td").eq(0).contains("stdvar"); // 第二列的第一行是 “CPU 利用率差值” + // cy.wrap($el).find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + // const num = parseFloat($el.text()); + // expect(num).to.be.gte(0); + // }); + } + }); + + // 实时 CPU 利用率(Gauge) + cy.getPannelContentByTitle("实时 CPU 利用率").contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // CPU 利用率两日分布(Histogram) + cy.getPannelContentByTitle("CPU 利用率两日分布").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + cy.getPannelContentByTitle("CPU 利用率两日分布").find("tbody tr").eq(0).find("td").eq(0).contains("今日"); // 第一列的第一行是 “今日” + cy.getPannelContentByTitle("CPU 利用率两日分布").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第一列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // CPU利用率波动(Time series 面板) + cy.getPannelContentByTitle("CPU利用率波动").find("tbody tr").should("have.length", 1); // Legend 有一列 + cy.getPannelContentByTitle("CPU利用率波动").find("tbody tr").eq(0).find("td").eq(0).contains("标准差"); // 第一列的第一行是 “标准差” + cy.getPannelContentByTitle("CPU利用率波动").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + + // 实时内存使用率(Gauge) + cy.getPannelContentByTitle("实时内存使用率").contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 内存使用率(Memory Usage)(Time series 面板) + cy.getPannelContentByTitle("内存使用率(Memory Usage)").find("tbody tr").should("have.length", 1); // Legend 有一列 + cy.getPannelContentByTitle("内存使用率(Memory Usage)").find("tbody tr").eq(0).find("td").eq(0).contains("Memory utilization rate"); // 第一列的第一行是 “Memory utilization rate” + cy.getPannelContentByTitle("内存使用率(Memory Usage)").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.greaterThan(0); + }); + + // 内存使用率波动(Memory Usage Rate Fluctuation)(Time series 面板) + cy.getPannelContentByTitle("内存使用率波动(Memory Usage Rate Fluctuation)").find("tbody tr").should("have.length", 1); // Legend 有一列 + cy.getPannelContentByTitle("内存使用率波动(Memory Usage Rate Fluctuation)").find("tbody tr").eq(0).find("td").eq(0).contains("标准差(stddev)"); // 第一列的第一行是 “标准差” + cy.getPannelContentByTitle("内存使用率波动(Memory Usage Rate Fluctuation)").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + + // 磁盘空间总额(Pie Chart) + cy.getPannelContentByTitle("磁盘空间总额").find("ul li").should("have.length.gte", 1); // Legend 至少有一列 + + // 磁盘空间使用率(Time series 面板) + cy.getPannelContentByTitle("磁盘空间使用率").find("ul li").should("have.length.gte", 1); // Legend 至少有一列 + + // Disk Read IOps (Time series 面板) + cy.getPannelContentByTitle("Disk Read IOps").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + + // Disk R/W Data (Time series 面板) + cy.getPannelContentByTitle("Disk R/W Data").find("tbody tr").should("have.length.gte", 1); // Legend 至少有一列 + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // Network + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // 实时接收速率(Stat面板) + cy.getPannelContentByTitle("实时接收速率").contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + + // 网络流量监控 + cy.getPannelContentByTitle("网络流量监控").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("网络流量监控").find("tbody tr").eq(0).find("td").eq(0).contains("接收速率"); // 第一列的第一行是 “接收速率” + cy.getPannelContentByTitle("网络流量监控").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第一列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + cy.getPannelContentByTitle("网络流量监控").find("tbody tr").eq(1).find("td").eq(0).contains("发送速率"); // 第二列的第一行是 “发送速率” + cy.getPannelContentByTitle("网络流量监控").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.lte(0); + }); + + // 网络流量波动监测 + cy.getPannelContentByTitle("网络流量波动监测").find("tbody tr").should("have.length", 2); // Legend 有两列 + cy.getPannelContentByTitle("网络流量波动监测").find("tbody tr").eq(0).find("td").eq(0).contains("接收速率标准差"); // 第一列的第一行是 “接收速率标准差” + cy.getPannelContentByTitle("网络流量波动监测").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第一列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + cy.getPannelContentByTitle("网络流量波动监测").find("tbody tr").eq(1).find("td").eq(0).contains("发送速率标准差"); // 第二列的第一行是 “发送速率标准差” + cy.getPannelContentByTitle("网络流量波动监测").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.lte(0); + }); + + // 实时发送速率(Stat面板) + cy.getPannelContentByTitle("实时发送速率").contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // System Load + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // 实时系统负载(1分钟)(Gauge) + cy.getPannelContentByTitle("实时系统负载(1分钟)").contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + + // 系统负载(Time series 面板) + cy.getPannelContentByTitle("系统负载").find("tbody tr").should("have.length", 3); // Legend 有三列 + cy.getPannelContentByTitle("系统负载").find("tbody tr").eq(0).find("td").eq(0).contains("1分钟内负载(Load1)"); // 第一列的第一行是 "1分钟内负载(Load1)" + cy.getPannelContentByTitle("系统负载").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第一列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + cy.getPannelContentByTitle("系统负载").find("tbody tr").eq(1).find("td").eq(0).contains("5分钟内负载(Load5)"); // 第二列的第一行是 "5分钟内负载(Load5)" + cy.getPannelContentByTitle("系统负载").find("tbody tr").eq(1).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + cy.getPannelContentByTitle("系统负载").find("tbody tr").eq(2).find("td").eq(0).contains("15分钟内负载(Load15)"); // 第三列的第一行是 "15分钟内负载(Load15)" + cy.getPannelContentByTitle("系统负载").find("tbody tr").eq(2).find("td").eq(1).contains(/\d+/).then(($el) => { // 第三列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + + // 系统负载波动(Load1)(Time series 面板) + cy.getPannelContentByTitle("系统负载波动(Load1)").find("tbody tr").should("have.length", 1); // Legend 有一列 + cy.getPannelContentByTitle("系统负载波动(Load1)").find("tbody tr").eq(0).find("td").eq(0).contains("标准差"); // 第一列的第一行是 “标准差” + cy.getPannelContentByTitle("系统负载波动(Load1)").find("tbody tr").eq(0).find("td").eq(1).contains(/\d+/).then(($el) => { // 第二列的第二行是数值 + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/monitor/nginx_monitor.cy.js b/sysom_web/cypress/e2e/monitor/nginx_monitor.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..3c824c352aad266c00ba3206e6c8048f2525ab0c --- /dev/null +++ b/sysom_web/cypress/e2e/monitor/nginx_monitor.cy.js @@ -0,0 +1,136 @@ +/// + +describe("SysOM Nginx Observer Dashboard Test", () => { + beforeEach(() => { + cy.login(); + }) + + it("Nginx Observer test", () => { + // 1. 访问集群监控页面 + cy.visit("app_observable/Nginx"); + + // 2. 等待页面加载完成 + cy.wait(2000); + + // 3. 检查异常告警pannel + cy.getPannelContentByTitle("异常告警分布(次数)").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).contains("请求抖动") + cy.wrap($el).contains("请求4xx") + cy.wrap($el).contains("请求5xx") + cy.wrap($el).contains("错误日志") + } + }); + + // 4. 检查请求数pannel + cy.getPannelContentByTitle("请求数").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("ul li").should("have.length.gte", 1); + cy.wrap($el).find("ul li").eq(0).contains("requests"); + } + }); + + // 5. 检查http status分布pannel + cy.getPannelContentByTitle("http status分布").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("ul li").should("have.length.gte", 1); + cy.wrap($el).find("ul li").eq(0).contains("status_1xx"); + cy.wrap($el).find("ul li").eq(1).contains("status_2xx"); + cy.wrap($el).find("ul li").eq(2).contains("status_3xx"); + cy.wrap($el).find("ul li").eq(3).contains("status_4xx"); + cy.wrap($el).find("ul li").eq(4).contains("status_5xx"); + } + }); + + + // 6. 检查响应时延pannel + cy.getPannelContentByTitle("响应时延").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("ul li").should("have.length.gte", 1); + cy.wrap($el).find("ul li").eq(0).contains("requestTime"); + cy.wrap($el).find("ul li").eq(1).contains("upstreamTime"); + cy.wrap($el).find("ul li").eq(2).contains("maxRequestTime"); + cy.wrap($el).find("ul li").eq(3).contains("maxUpstreamTime"); + } + }); + + // 7. 检查workers数量pannel + cy.getPannelContentByTitle("workers数量").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("ul li").find("ul li").should("have.length.gte", 1); + cy.wrap($el).find("ul li").find("ul li").eq(0).contains("workersCount"); + } + }); + + // 8. 检查活跃的连接数pannel + cy.getPannelContentByTitle("活跃的连接数").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("ul li").should("have.length.gte", 1); + cy.wrap($el).find("ul li").eq(0).contains("activeConnections"); + } + }); + + // 9. 检查nginx进程cpu利用率pannel + cy.getPannelContentByTitle("nginx进程cpu利用率").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("ul li").should("have.length.gte", 1); + cy.wrap($el).find("ul li").eq(0).contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + cy.wrap($el).find("ul li").eq(1).contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + } + }); + + // 10. 检查nginx进程内存利用率pannel + cy.getPannelContentByTitle("nginx进程内存利用率").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("ul li").should("have.length.gte", 1); + cy.wrap($el).find("ul li").eq(0).contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + cy.wrap($el).find("ul li").eq(1).contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + } + }); + + // 11. 检查nginx进程网络流量pannel + cy.getPannelContentByTitle("nginx进程网络流量").then($el => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data") + } else { + cy.wrap($el).find("ul li").should("have.length.gte", 1); + cy.wrap($el).find("ul li").eq(0).contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + cy.wrap($el).find("ul li").eq(1).contains(/\d+/).then(($el) => { + const num = parseFloat($el.text()); + expect(num).to.be.gte(0); + }); + } + }); + }) +}) \ No newline at end of file diff --git a/sysom_web/cypress/e2e/vmcore/vmcore_config.cy.js b/sysom_web/cypress/e2e/vmcore/vmcore_config.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..306789faf7aa5cb00527e0b9c7a0bb22a6f13e02 --- /dev/null +++ b/sysom_web/cypress/e2e/vmcore/vmcore_config.cy.js @@ -0,0 +1,24 @@ +/// + +describe("vmcore list page", () => { + it("show vmcore list", () => { + cy.intercept("POST","/api/v1/vmcore/vmcore_config_test").as("vmcoreTest") + cy.login() + cy.visit("/vmcore/config") + /* ==== Generated with Cypress Studio ==== */ + cy.get('#name').clear(); + cy.get('#name').type("testconfig"); + cy.get('#server_host').clear(); + cy.get('#server_host').type("127.0.0.1"); + cy.get('#mount_point').clear(); + cy.get('#mount_point').type("/tmp/vmcore-nfs"); + cy.get('#days').clear(); + /* ==== Generated with Cypress Studio ==== */ + cy.get('.ml20___wnEIC').click(); + cy.get(':nth-child(1) > .ant-descriptions-item > .ant-descriptions-item-container > .ant-descriptions-item-content').should('have.text',"testconfig"); + cy.get(':nth-child(2) > .ant-descriptions-item > .ant-descriptions-item-container > .ant-descriptions-item-content').should('have.text',"127.0.0.1"); + cy.get(':nth-child(3) > .ant-descriptions-item > .ant-descriptions-item-container > .ant-descriptions-item-content').should('have.text',"/tmp/vmcore-nfs"); + //cy.get('.configleft17___1xVO3').click(); + /* ==== End Cypress Studio ==== */ + }) +}) diff --git a/sysom_web/cypress/e2e/vmcore/vmcore_list.cy.js b/sysom_web/cypress/e2e/vmcore/vmcore_list.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..93269ba35edbbcf1077fa1ee346a4e1e9d8817a2 --- /dev/null +++ b/sysom_web/cypress/e2e/vmcore/vmcore_list.cy.js @@ -0,0 +1,14 @@ +/// + +describe("vmcore list page", () => { + it("show vmcore list", () => { + cy.login() + cy.visit("/vmcore/list") + /* ==== Generated with Cypress Studio ==== */ + cy.get(':nth-child(1) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value > .ant-statistic-content-value-int').invoke('text').should('match',/^\d+/); + cy.get(':nth-child(3) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value > .ant-statistic-content-value-int').invoke('text').should('match',/^\d+/); + cy.get(':nth-child(5) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value ').invoke('text').should('match',/^\d+\.\d+$/); + cy.get(':nth-child(7) > .ant-pro-card > .ant-pro-card-body > .ant-statistic > .ant-statistic-content > .ant-statistic-content-value ').invoke('text').should('match',/^\d+\.\d+$/); + /* ==== End Cypress Studio ==== */ + }) +}) diff --git a/sysom_web/cypress/e2e/vmcore/vmcore_match.cy.js b/sysom_web/cypress/e2e/vmcore/vmcore_match.cy.js new file mode 100644 index 0000000000000000000000000000000000000000..f7c1825a4f8850e48f509ca92686337828656da9 --- /dev/null +++ b/sysom_web/cypress/e2e/vmcore/vmcore_match.cy.js @@ -0,0 +1,17 @@ +/// + +describe("vmcore list page", () => { + it("show vmcore list", () => { + cy.login() + cy.visit("/vmcore/match") + /* ==== Generated with Cypress Studio ==== */ + cy.get('#similar_dmesg').clear(); + //let inputText = " [6231329.685410] Kernel panic - not syncing: sysrq triggered crash \n[6231329.685933] CPU: 3 PID: 514785 Comm: bash Kdump: loaded Tainted: G W E 5.10.134-14.an8.x86_64 #1 \n[6231329.686813] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 449e491 04/01/2014 \n[6231329.687515] Call Trace: \n[6231329.687759] dump_stack+0x57/0x6e\n [6231329.688070] panic+0x10d/0x2e9 \n[6231329.688392] sysrq_handle_crash+0x16/0x20\n [6231329.688759] __handle_sysrq.cold.18+0x7a/0xe8\n [6231329.689159] write_sysrq_trigger+0x2b/0x40 \n [6231329.689548] proc_reg_write+0x3b/0x80 \n[6231329.689887] vfs_write+0xb5/0x260 \n[6231329.690203] ksys_write+0x49/0xc0 \n[6231329.690523] do_syscall_64+0x33/0x40 \n[6231329.690853] entry_SYSCALL_64_after_hwframe+0x61/0xc6 \n[6231329.691325] RIP: 0033:0x7f4d40d205a8 "; + //cy.get("#similar_dmesg").invoke('val', inputText) + //cy.get('#similar_dmesg').type( " [6231329.685410] Kernel panic - not syncing: sysrq triggered crash \n[6231329.685933] CPU: 3 PID: 514785 Comm: bash Kdump: loaded Tainted: G W E 5.10.134-14.an8.x86_64 #1 \n[6231329.686813] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 449e491 04/01/2014 \n[6231329.687515] Call Trace: \n[6231329.687759] dump_stack+0x57/0x6e\n [6231329.688070] panic+0x10d/0x2e9 \n[6231329.688392] sysrq_handle_crash+0x16/0x20\n [6231329.688759] __handle_sysrq.cold.18+0x7a/0xe8\n [6231329.689159] write_sysrq_trigger+0x2b/0x40 \n [6231329.689548] proc_reg_write+0x3b/0x80 \n[6231329.689887] vfs_write+0xb5/0x260 \n[6231329.690203] ksys_write+0x49/0xc0 \n[6231329.690523] do_syscall_64+0x33/0x40 \n[6231329.690853] entry_SYSCALL_64_after_hwframe+0x61/0xc6 \n[6231329.691325] RIP: 0033:0x7f4d40d205a8 "); + cy.get('#similar_dmesg').type("123") + cy.get(':nth-child(2) > .ant-btn').click(); + cy.get('table > tbody').eq(0).children().should('have.length.of.at.least',1) + /* ==== End Cypress Studio ==== */ + }) +}) diff --git a/sysom_web/cypress/e2e/vul/repair.cy.js b/sysom_web/cypress/e2e/vul/repair.cy.js index 159c68945cb33430b187e7cdf1969796d7e231df..aed8320e238c4ae3940ba341cf28c6557bbf52a7 100644 --- a/sysom_web/cypress/e2e/vul/repair.cy.js +++ b/sysom_web/cypress/e2e/vul/repair.cy.js @@ -1,16 +1,16 @@ /// describe("SysOM Vul Manager Test", () => { - - it("login failed", () => { - - cy.login() + beforeEach(() => { + cy.login() + }) + it("repair test", () => { cy.intercept("POST", "api/v1/vul/") .as("repair") // 1. 访问安全中心列表 - cy.visit("security/list") + cy.visit("/security/list") //2.选择第一条数据,点击修复按钮 cy.get('table').within(() => { diff --git a/sysom_web/cypress/support/commands.js b/sysom_web/cypress/support/commands.js index 8bc4b4defe6a036da3913be8979ff5b4d0a44673..e141dfc21b4b1499b42dab0d10cfb1def65f4130 100644 --- a/sysom_web/cypress/support/commands.js +++ b/sysom_web/cypress/support/commands.js @@ -24,12 +24,66 @@ // -- This will overwrite an existing command -- // Cypress.Commands.overwrite('visit', (originalFn, url, options) => { ... }) +Cypress.on('uncaught:exception', (err, runnable) => { + // return false to prevent the error from failing the test if it matches + // the specific error message + if (err.message.includes('ResizeObserver loop completed with undelivered notifications')) { + return false; + } + // else let Cypress handle the exception as it normally does + return true; +}); + + Cypress.Commands.add("login", () => { cy.visit("/user/login") cy.get("#username").focus().clear().type(Cypress.env("SYSOM_ACCOUNT_USERNAME")) cy.get("#password").focus().clear().type(Cypress.env("SYSOM_ACCOUNT_PASSWORD")) + let password = Cypress.env("SYSOM_ACCOUNT_PASSWORD") cy.get("button").contains("登录").click() - cy.get("button").contains("忽 略").click() + if (password == "123456") { + cy.get("button").contains("忽 略").click() + } else { + cy.wait(2000) + } +}) + +Cypress.Commands.add('addDefaultHost', () => { + cy.intercept("POST", "/api/v1/host/") + .as("createHost") + + // 1. 点击新建主机打开模块框 + cy.get("button").contains("新建主机").click() + + // 2. 在模态框内部填充字段 + cy.get(".ant-modal-content").first().within(() => { + // 1.1 cluster + // cy.get("#cluster").focus().type("default") + cy.get('#cluster').type("default").type("{enter}", {force: true}) + + // 2.2 hostname + cy.get("#hostname").focus().clear().type("local") + + // 2.3 username + cy.get("#username").focus().clear().type("root") + + // 2.4 password + const default_host_password = Cypress.env("DEFAULT_HOST_PASSWORD") + cy.get("#host_password").focus().clear().type(default_host_password) + + // 2.5 ip + cy.get("#ip").focus().clear().type("127.0.0.1") + + // 2.6 port + cy.get("#port").focus().clear().type("22") + + // 2.7 确认 + cy.get("button").contains("确 认").click() + + // 2.8 等待新建主机请求结束,判断请求是否成功 + // 检查状态码返回是否是200(如果集群已经存在会返回400) + cy.wait('@createHost').its("response.statusCode").should("eq", 200) + }) }) /** @@ -39,6 +93,8 @@ Cypress.Commands.add("login", () => { * @param {*} resultCheckCallback 诊断结果处理(在此处判断诊断结果是否符合预期) */ Cypress.Commands.add("sysomDiagnosisCheck", (pageUrl, params, resultCheckCallback) => { + cy.intercept("GET", "/api/v1/host") + .as("getHostList") cy.intercept("POST", "/api/v1/tasks/") .as("createDiagnosisTask") @@ -50,11 +106,44 @@ Cypress.Commands.add("sysomDiagnosisCheck", (pageUrl, params, resultCheckCallbac // 1. 访问自定义诊断页面 cy.visit(pageUrl) + cy.wait(1000) + + cy.wait('@getHostList') + .then((interception) => { + expect(interception).to.have.property('response') + expect(interception.response?.body.code, 'code').to.equal(200) + expect(interception.response.statusCode).to.equal(200) + const { data } = interception.response.body + const ipList = data.map((item) => { + return item.ip + }) + + const defaultHostIpOne = Cypress.env("HOSTS")[0] + const defaultHostIpTwo = Cypress.env("HOSTS")[1] + + if (ipList.includes(defaultHostIpOne) && ipList.includes(defaultHostIpTwo)) { + expect(defaultHostIpOne).to.be.oneOf(ipList) + expect(defaultHostIpTwo).to.be.oneOf(ipList) + } else { + // 1. 跳转到主机列表页面 + cy.visit("/host/list") + cy.wait(1000) + + // 2. 添加默认主机 + cy.addDefaultHost() + cy.wait(1000) + + // 返回当前页面 + cy.visit(pageUrl) + cy.wait(30000) + } + }) // 2 输入instance参数 for (let k in params) { if (k.indexOf("instance") != -1) { cy.get(`#${k}`).parent().parent().click() + cy.get(`#${k}`).type(params[k]) cy.get(".rc-virtual-list-holder-inner").contains(params[k]).click() } else { cy.get(`#${k}`).invoke("attr", "readonly").then(res => { @@ -63,7 +152,7 @@ Cypress.Commands.add("sysomDiagnosisCheck", (pageUrl, params, resultCheckCallbac cy.get(`#${k}`).parent().parent().click() cy.get(".rc-virtual-list-holder-inner").contains(params[k]).click() } else { - cy.get(`#${k}`).focus().clear().type(params[k], {force: true}) + cy.get(`#${k}`).focus().clear().type(params[k], { force: true }) } }) } @@ -78,7 +167,8 @@ Cypress.Commands.add("sysomDiagnosisCheck", (pageUrl, params, resultCheckCallbac const getAndCheckTaskResult = (task_id) => { // 点击一下刷新按钮 - cy.get('span[aria-label="reload"]').click() + //cy.get('span[aria-label="reload"]').click({"multiple": true}) + cy.get('svg[data-icon="reload"]').first().click({"multiple": true}) cy.wait("@getDiagnosisTasks").its("response.statusCode").should("eq", 200) cy.wait(100) cy.get("td") @@ -123,4 +213,155 @@ Cypress.Commands.add("sysomDiagnosisCheck", (pageUrl, params, resultCheckCallbac // 轮询获取诊断结果 getAndCheckTaskResult(task_id) }) -}) \ No newline at end of file +}); + +Cypress.Commands.add("getIframeBody", (iframe_selector = "iframe") => { + cy.get(iframe_selector) + // Cypress yields jQuery element, which has the real + // DOM element under property "0". + // From the real DOM iframe element we can get + // the "document" element, it is stored in "contentDocument" property + // Cypress "its" command can access deep properties using dot notation + // https://on.cypress.io/its + .its('0.contentDocument') + .should('exist') + // automatically retries until body is loaded + .its('body').should('not.be.undefined') + // wraps "body" DOM element to allow + // chaining more Cypress commands, like ".find(...)" + // https://on.cypress.io/wrap + .then((body) => cy.wrap(body, { log: false })) +}); + +Cypress.Commands.add("getPannelByTitle", (title) => { + return cy.getIframeBody() + .find("div.react-grid-layout") + .find("div.react-grid-item") + .find("div.panel-header") + .filter((_, element) => { + return Cypress.$(element).text().trim() == title + }) + .parents("div.react-grid-item") +}); + +Cypress.Commands.add("getPannelHeaderByTitle", (title) => { + return cy.getPannelByTitle(title).find("div.panel-header") +}); + +Cypress.Commands.add("getPannelContentByTitle", (title) => { + return cy.getPannelByTitle(title).find("div.panel-content").scrollIntoView(); +}); + +/** + * SysOM 日志模块测试封装 + * @param {*} pageUrl 页面地址 + * @param {*} params 列表过滤参数 + * @param {*} resultCallback 结果回调, 返回结果是否达到预期 + */ +Cypress.Commands.add("sysomLogSelectOrFilter", (pageUrl, params, isSelect) => { + // 1. 跳转到自定义log页面 + cy.visit(pageUrl) + + // 2. 等待页面加载完成 + cy.wait(2000) + + // 3. 查找filter展开按钮并点击,若没有则跳过 + if (isSelect) { + const el_show = 'a[class="ant-pro-form-collapse-button"]' + cy.get(el_show).contains("展开").click() + } + + // 4. 输入查询参数 + for (let param in params) { + cy.get(`input[id=${param}]`).focus().type(params[param], {force: true}) + } + + // 5. 点击查询按钮 + cy.get(':nth-child(2) > .ant-btn > span').click() + + // 6. table 内容断言 + cy.get('.ant-table-tbody').find('tr').should("have.length.gte", 1) + +}) + +/** + * 打开闭合的主标签 + * @param {*} title panel tag + */ +Cypress.Commands.add("openMainLabel", (title) => { + cy.getIframeBody() + .find("div.react-grid-layout") + .find("div.react-grid-item") + .find("a") + .contains(title).click() +}) + +/* +* 面板数值大于等于 0 +* @param {*} title panel tag +*/ +Cypress.Commands.add("panelNumericalValueGteTest", (title) => { + cy.getPannelContentByTitle(title).contains(/\d+/).then(($el) => { + const num = parseInt($el.text()); + expect(num).to.be.gte(0); + }) +}) + +/** + * 面板取a便签数 大于等于 0 + * @param {*} title panel tag + */ + +Cypress.Commands.add("panelAtagValueGteTest", (title) => { + cy.getPannelContentByTitle(title).find("a").should("have.length.gte", 0) +}) + +/** + * 面板取a便签数 大于等于 0, 无数据时显示No data + * @param {*} title panel tag + */ +Cypress.Commands.add("panelAtagValueGteOrNoDataTest", (title) => { + cy.getPannelContentByTitle(title) + .then(($el) => { + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + cy.wrap($el).find("a").should("have.length.gte", 0) + } + }) +}) + +/** + * 面板取折线统计图且具有Table + * @params {*} title panel tag + * @items {*} items 属性名称 列表 + */ +Cypress.Commands.add("panelFoldLineTableGteTest", (title, Array) => { + cy.getPannelContentByTitle(title).find("tbody tr").should("have.length.gt", 0) + if (Array !== undefined) { + Array.forEach((item, index) => { + cy.getPannelContentByTitle(title).find("tbody tr").eq(index).find("td").eq(0).contains(item) + }) + } +}) + +Cypress.Commands.add("panelNoDataTest", (title) => { + cy.getPannelContentByTitle(title).then(($el) => { + console.log($el.text()) + if ($el.text().includes("No data")) { + cy.wrap($el).contains("No data"); + } else { + cy.getPannelContentByTitle(title).find("canvas").should('be.empty') + } + }) +}) + +Cypress.Commands.add("diagnosisTaskResultHandler", (result, callback) => { + if (result.status === "Success") { + callback && callback() + } else { + cy.get('.ant-modal-confirm-title').should("include.text", "诊断失败") + cy.get("button").should("include.text", "OK") + cy.get("button").contains("OK").click() + } +}) diff --git a/sysom_web/package.json b/sysom_web/package.json index cc68dd05f33fcd4176291e274161604a107e083e..a361f2f06d727013961819753f7c9e78d5abb7b7 100644 --- a/sysom_web/package.json +++ b/sysom_web/package.json @@ -85,7 +85,8 @@ "xterm": "^5.1.0", "xterm-addon-attach": "^0.8.0", "xterm-addon-fit": "^0.7.0", - "xterm-addon-web-links": "^0.8.0" + "xterm-addon-web-links": "^0.8.0", + "zustand": "^4.4.7" }, "devDependencies": { "@types/express": "^4.17.0", diff --git a/sysom_web/public/resource/app_observable/v1/locales.json b/sysom_web/public/resource/app_observable/v1/locales.json new file mode 100644 index 0000000000000000000000000000000000000000..bce29568b96435066acbf3428e39f36331a8ed69 --- /dev/null +++ b/sysom_web/public/resource/app_observable/v1/locales.json @@ -0,0 +1,27 @@ +{ + "version": 1.0, + "menus": [ + "menu.app_observable.ntopo", + "menu.app_observable.mysql", + "menu.app_observable.nginx", + "menu.app_observable.process_app" + ], + "locales": { + "zh-CN": { + "menu.app_observable.mysql": "MySQL应用观测", + "menu.app_observable.nginx": "Nginx应用观测", + "menu.app_observable.ntopo": "网络拓扑", + "menu.app_observable.process_app": "Java可观测", + "pages.app_observable.monitor_dashboard": "监控面板", + "pages.app_observable.abnormal_events": "异常事件" + }, + "en-US": { + "menu.app_observable.ntopo": "Network Topology", + "menu.app_observable.mysql": "MySQL Observability", + "menu.app_observable.nginx": "Nginx Observability", + "menu.app_observable.process_app": "Java Observability", + "pages.app_observable.monitor_dashboard": "Monitor Dashboard", + "pages.app_observable.abnormal_events": "Abnormal Events" + } + } +} \ No newline at end of file diff --git a/sysom_web/public/resource/app_observable/v1/mysql.json b/sysom_web/public/resource/app_observable/v1/mysql.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae4d13d5b9b5e7449376f88d6d814404ad817a6 --- /dev/null +++ b/sysom_web/public/resource/app_observable/v1/mysql.json @@ -0,0 +1,16 @@ +{ + "menuName": "menu.app_observable.mysql", + "type": "multiGrafanaPannel", + "config": [ + { + "pannelId": "mysql_monitor", + "pannelName": "pages.app_observable.monitor_dashboard", + "pannelUrl": "/grafana/d/hOk70b34k/app-mysql" + }, + { + "pannelId": "mysql_event", + "pannelName": "pages.app_observable.abnormal_events", + "pannelUrl": "/grafana/d/Ub__1x3Vz/app-mysql-events" + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/app_observable/v1/nginx.json b/sysom_web/public/resource/app_observable/v1/nginx.json new file mode 100644 index 0000000000000000000000000000000000000000..8a2c4e2073d40da999ff311b12606aa9b153ee0b --- /dev/null +++ b/sysom_web/public/resource/app_observable/v1/nginx.json @@ -0,0 +1,16 @@ +{ + "menuName": "menu.app_observable.nginx", + "type": "multiGrafanaPannel", + "config": [ + { + "pannelId": "nginx_monitor", + "pannelName": "pages.app_observable.monitor_dashboard", + "pannelUrl": "/grafana/d/6Mztrm4Ik/nginx" + }, + { + "pannelId": "nginx_event", + "pannelName": "pages.app_observable.abnormal_events", + "pannelUrl": "/grafana/d/HtuWUeSSz/nginx-event" + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/app_observable/v1/ntopo.json b/sysom_web/public/resource/app_observable/v1/ntopo.json new file mode 100644 index 0000000000000000000000000000000000000000..a32715844c5946adad8ae3e2a41286207268dabf --- /dev/null +++ b/sysom_web/public/resource/app_observable/v1/ntopo.json @@ -0,0 +1,9 @@ +{ + "menuName": "menu.app_observable.ntopo", + "type": "singleGrafanaPannel", + "config": { + "pannelId": "ntopo", + "pannelName": "", + "pannelUrl": "/grafana/d/H04tHN34k/ntopo" + } +} \ No newline at end of file diff --git a/sysom_web/public/resource/app_observable/v1/process_app.json b/sysom_web/public/resource/app_observable/v1/process_app.json new file mode 100644 index 0000000000000000000000000000000000000000..c9fd381596c2bfd7162b7e49c56aadabef593345 --- /dev/null +++ b/sysom_web/public/resource/app_observable/v1/process_app.json @@ -0,0 +1,9 @@ +{ + "menuName": "menu.app_observable.process_app", + "type": "singleGrafanaPannel", + "config": { + "pannelId": "process_app", + "pannelName": "", + "pannelUrl": "/grafana/d/FP_k0bqVz/process_app" + } +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v1/link/procdiag.json b/sysom_web/public/resource/diagnose/v1/link/procdiag.json new file mode 100644 index 0000000000000000000000000000000000000000..8d39c9be0a903d668df3fb2173e61ecc169804f6 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v1/link/procdiag.json @@ -0,0 +1,38 @@ +{ + "servicename": "procdiag", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP" + }, + { + "type": "text", + "name": "ipport", + "initialValue": "", + "label": "IP:Port", + "tooltips": "目标IP:端口" + }, + { + "type": "text", + "name": "time", + "initialValue": "60", + "label": "诊断时长", + "tooltips": "输入诊断时间长度,单位秒" + } + + ], + "variables": [], + "pannels": [ + { + "key": "progdiag_data", + "type": "markdown", + "title": "应用诊断", + "datasource": "procdiag_data" + } + ] +} + diff --git a/sysom_web/public/resource/diagnose/v1/locales.json b/sysom_web/public/resource/diagnose/v1/locales.json index ffe97d45c6ed3d326fa0dde78df094cfd3b23906..c66977bcfd0422f64c2e83dd03ff90f49d8d8241 100644 --- a/sysom_web/public/resource/diagnose/v1/locales.json +++ b/sysom_web/public/resource/diagnose/v1/locales.json @@ -18,6 +18,7 @@ "menu.diagnose.storage.iolatency": "IO时延分析", "menu.diagnose.storage.iofsstat": "IO流量分析", "menu.diagnose.storage.iohang": "IO HANG诊断", + "menu.diagnose.storage.iodiagnose": "IO 一键诊断", "menu.diagnose.custom.command": "命令诊断", "menu.diagnose.custom.rca": "指标异常分析", "menu.diagnose.net.packetdrop": "丢包诊断", @@ -25,6 +26,7 @@ "menu.diagnose.net.retran": "重传诊断", "menu.diagnose.net.pingtrace": "时延诊断", "menu.diagnose.link.rtdelay": "RT时延分析", - "menu.diagnose.link.jruntime": "java运行时分析" + "menu.diagnose.link.jruntime": "java运行时分析", + "menu.diagnose.link.procdiag": "应用抖动诊断" } } diff --git a/sysom_web/public/resource/diagnose/v1/storage/iodiagnose.json b/sysom_web/public/resource/diagnose/v1/storage/iodiagnose.json new file mode 100644 index 0000000000000000000000000000000000000000..3c43d716038546cf2bf91325e8a6c31a2a6bbbdd --- /dev/null +++ b/sysom_web/public/resource/diagnose/v1/storage/iodiagnose.json @@ -0,0 +1,28 @@ +{ + "servicename": "iodiagnose", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP,我们将在会这台机器内部发起IO诊断" + }, + { + "type": "text", + "name": "timeout", + "initialValue": "30", + "label": "诊断时长", + "tooltips": "诊断时长,也是IO诊断统计周期,单位秒,不建议低于20秒" + } + ], + "pannels": [ + { + "key": "overview", + "type": "table", + "title": "diagnose result", + "datasource": "overview" + } + ] +} diff --git a/sysom_web/public/resource/diagnose/v1/storage/iolatency.json b/sysom_web/public/resource/diagnose/v1/storage/iolatency.json index df3d060fc68a8525953e7e4ed3661e84e108fac8..2e578b57887ae8e2dd7f849df395956472fd6ee7 100644 --- a/sysom_web/public/resource/diagnose/v1/storage/iolatency.json +++ b/sysom_web/public/resource/diagnose/v1/storage/iolatency.json @@ -55,8 +55,8 @@ "thresholds": { "mode": "absolute", "steps": [ - { "color": "red", "value": 1 }, - { "color": "green", "value": 0 } + { "color": "red", "value": 0 }, + { "color": "green", "value": -1 } ] } } diff --git a/sysom_web/public/resource/diagnose/v2/colocation/cpi.json b/sysom_web/public/resource/diagnose/v2/colocation/cpi.json new file mode 100644 index 0000000000000000000000000000000000000000..f2b1acd79fe4e2b38add069f4311b66553e30ba8 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/colocation/cpi.json @@ -0,0 +1,43 @@ +{ + "servicename": "colocation_cpi", + "version": 1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将对这台机器的发起历史现场诊断" + }, + { + "type": "text", + "name": "moment", + "initialValue": "", + "label": "时刻", + "tooltips": "我们将对该时刻的前5分钟内的混部CPI干扰进行诊断,不填写则对当前时刻进行诊断。时间字符串格式参考 '2024-01-01 00:00:00'" + } + ], + "variables": [], + "pannels": [ + { + "key": "overview", + "type": "markdown", + "title": "诊断结果汇总", + "datasource": "overview" + }, + { + "key": "container-table", + "type": "table", + "title": "容器信息", + "datasource": "container-table", + "tooltips": "容器维度干扰信息汇总 用于检查容器的受损程度以及受损原因" + }, + { + "key": "disturb-timeseries", + "title": "时序信息", + "tooltips": "时序维度干扰信息汇总 用于对LLC以及内存带宽资源与干扰次数进行关联分析", + "datasource": "disturb-timeseries", + "type": "timeseries" + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/colocation/serveutil.json b/sysom_web/public/resource/diagnose/v2/colocation/serveutil.json new file mode 100644 index 0000000000000000000000000000000000000000..6b2d4785129ad3d442edba40d9f61ee9535d3e9e --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/colocation/serveutil.json @@ -0,0 +1,36 @@ +{ + "servicename": "colocation_serveutil", + "version": 1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将对这台机器发起历史现场诊断" + }, + { + "type": "text", + "name": "moment", + "initialValue": "", + "label": "时刻", + "tooltips": "我们将对该时刻满足率干扰进行诊断,不填写则对当前时刻进行诊断。时间字符串格式为参考 '2024-01-01 00:00:00'" + } + ], + "variables": [], + "pannels": [ + { + "key": "overview", + "type": "markdown", + "title": "诊断结果汇总", + "datasource": "overview" + }, + { + "key": "container-table", + "type": "table", + "title": "受干扰容器详情", + "datasource": "container-table", + "tooltips": "受干扰容器详情" + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/cpu/cpuhigh.json b/sysom_web/public/resource/diagnose/v2/cpu/cpuhigh.json new file mode 100644 index 0000000000000000000000000000000000000000..d0d4b4031f57cf3f8bffab115f6b8d1850e48093 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/cpu/cpuhigh.json @@ -0,0 +1,35 @@ +{ + "servicename": "cpuhigh", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "moment", + "initialValue": "2023-08-12 00:00:00", + "label": "时间", + "tooltips": "CPU冲高发生的时间点" + } + ], + "variables": [], + "pannels": [ + { + "key": "cpuhighEvent", + "type": "stat", + "title": "Event overview", + "datasource": "cpuhighEventSummary" + }, + { + "key": "cpuhighTable", + "type": "table", + "title": "cpu high Details", + "datasource": "cpuhighTable" + } + ] +} diff --git a/sysom_web/public/resource/diagnose/v2/cpu/loadtask.json b/sysom_web/public/resource/diagnose/v2/cpu/loadtask.json new file mode 100644 index 0000000000000000000000000000000000000000..7d4fd02f8597fb36088a0a55a71f0bcb1b13b9d2 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/cpu/loadtask.json @@ -0,0 +1,86 @@ +{ + "servicename": "loadtask", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + } + ], + "variables": [], + "pannels": [ + { + "key": "事件总览", + "type": "stat", + "title": "事件总览", + "datasource": "dataresult", + "fieldConfig": { + "mappings": [ + { + "type": "value", + "options": { + "true": { + "color": "red", + "text": "异常" + }, + "false": { + "color": "green", + "text": "正常" + } + } + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 20 + }, + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "" + } + }, + { + "key": "pieRow", + "type": "row", + "title": "测试行", + "datasource": "", + "children": [ + { + "key": "alltasks", + "type": "piechart", + "title": "R/D状态进程数量", + "datasource": "datataskcount" + }, + { + "key": "dtasks", + "type": "piechart", + "title": "D状态负载影响度", + "datasource": "datauninterruptload" + }, + { + "key": "rtasks", + "type": "piechart", + "title": "R状态负载影响度", + "datasource": "datarunningload" + } + ] + }, + { + "key": "火焰图", + "type": "svg", + "title": "调度火焰图", + "datasource": "dataflamegraph" + } + ] + +} diff --git a/sysom_web/public/resource/diagnose/v2/cpu/readme b/sysom_web/public/resource/diagnose/v2/cpu/readme new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_web/public/resource/diagnose/v2/cpu/schedmoni.json b/sysom_web/public/resource/diagnose/v2/cpu/schedmoni.json new file mode 100644 index 0000000000000000000000000000000000000000..9ea76e476ee4468729fc74b93dcd93183d3ba301 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/cpu/schedmoni.json @@ -0,0 +1,49 @@ +{ + "servicename": "schedmoni", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "timeout", + "initialValue": "20", + "label": "诊断时间", + "tooltips": "本次的期望的诊断时间,默认20秒" + }, + { + "type": "text", + "name": "threshold", + "initialValue": "20", + "label": "诊断阈值", + "tooltips": "延迟的阈值,超过这个阈值就记录,默认20ms" + } + ], + "variables": [], + "pannels": [ + { + "key": "jitterEvent", + "type": "stat", + "title": "Event overview", + "datasource": "jitterEventSummary" + }, + { + "key": "timeseriesTable", + "type": "timeseries", + "title": "Timeline Diagram", + "datasource": "jitterTimeSeries" + }, + { + "key": "jitterTable", + "type": "table", + "title": "Scheduling Jitter Details", + "datasource": "jitterTable" + } + ] + +} diff --git a/sysom_web/public/resource/diagnose/v2/cpu/taskprofile.json b/sysom_web/public/resource/diagnose/v2/cpu/taskprofile.json new file mode 100644 index 0000000000000000000000000000000000000000..48c259cf803d8464f2d29c662bd169f02adf38e6 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/cpu/taskprofile.json @@ -0,0 +1,29 @@ +{ + "servicename": "taskprofile", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP,我们将在会这台机器内部发起应用占用cpu统计" + }, + { + "type": "text", + "name": "timeout", + "initialValue": "5", + "label": "诊断时长", + "tooltips": "诊断时长,也是各应用占用cpu统计周期,单位分,建议不超过10分钟" + } + ], + "variables": [], + "pannels": [ + { + "key": "appProfile", + "type": "table", + "title": "App profile result", + "datasource": "appProfile" + } + ] +} diff --git a/sysom_web/public/resource/diagnose/v2/custom/rca.json b/sysom_web/public/resource/diagnose/v2/custom/rca.json new file mode 100644 index 0000000000000000000000000000000000000000..96560aef0b4def28601633f7da0c5e2dc6cfd3e6 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/custom/rca.json @@ -0,0 +1,37 @@ +{ + "servicename": "rca", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP" + }, + { + "type": "text", + "name": "base_item", + "initialValue": "test_RT", + "label": "异常指标", + "tooltips": "异常指标的名字,如用户RT" + }, + { + "type": "text", + "name": "time", + "initialValue": "", + "label": "异常时刻", + "tooltips": "请输入异常时间点,格式:2023-07-08 12:23:33" + } + ], + "variables": [], + "pannels": [ + { + "key": "rca_result", + "type": "markdown", + "title": "", + "datasource": "RcaResult" + } + ] +} + diff --git a/sysom_web/public/resource/diagnose/v2/link/jruntime.json b/sysom_web/public/resource/diagnose/v2/link/jruntime.json new file mode 100644 index 0000000000000000000000000000000000000000..67f1b3ef3470819a07fa1c99abbf4260d1cb76c2 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/link/jruntime.json @@ -0,0 +1,61 @@ +{ + "servicename": "jruntime", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP" + }, + { + "type": "text", + "name": "nums", + "initialValue": "3", + "label": "进程数量", + "tooltips": "请输入你需要诊断的进程数量," + }, + { + "type": "text", + "name": "pids", + "initialValue": "", + "label": "进程列表", + "tooltips": "进程列表,要用逗号隔开,进程数量和进程列表仅有一个参数有效" + }, + { + "type": "select", + "name": "global", + "initialValue": "on", + "label": "全局热点", + "tooltips": "是否需要全局热点", + "options": [ + { + "value": "on", + "label": "开启" + }, + { + "value": "off", + "label": "关闭" + } + ] + } + + ], + "variables": [], + "pannels": [ + { + "key": "jruntime_data", + "type": "markdown", + "title": "", + "datasource": "jruntime_data" + }, + { + "key": "jruntime_set", + "type": "svg", + "title": "运行时诊断结果", + "datasource": "svgdata" + } + ] +} + diff --git a/sysom_web/public/resource/diagnose/v2/link/procdiag.json b/sysom_web/public/resource/diagnose/v2/link/procdiag.json new file mode 100644 index 0000000000000000000000000000000000000000..8d39c9be0a903d668df3fb2173e61ecc169804f6 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/link/procdiag.json @@ -0,0 +1,38 @@ +{ + "servicename": "procdiag", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP" + }, + { + "type": "text", + "name": "ipport", + "initialValue": "", + "label": "IP:Port", + "tooltips": "目标IP:端口" + }, + { + "type": "text", + "name": "time", + "initialValue": "60", + "label": "诊断时长", + "tooltips": "输入诊断时间长度,单位秒" + } + + ], + "variables": [], + "pannels": [ + { + "key": "progdiag_data", + "type": "markdown", + "title": "应用诊断", + "datasource": "procdiag_data" + } + ] +} + diff --git a/sysom_web/public/resource/diagnose/v2/link/rtdelay.json b/sysom_web/public/resource/diagnose/v2/link/rtdelay.json new file mode 100644 index 0000000000000000000000000000000000000000..1ab6b88e71d2b93d734ff41c9dee936cdaa1f0ee --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/link/rtdelay.json @@ -0,0 +1,46 @@ +{ + "servicename": "rtdelay", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP" + }, + { + "type": "text", + "name": "pid", + "initialValue": "", + "label": "应用进程号", + "tooltips": "请输入你需要诊断的应用进程号" + }, + { + "type": "text", + "name": "server_pid", + "initialValue": "-1", + "label": "服务端进程号", + "tooltips": "请输入该应用访问的服务端进程号(-1为不跟踪服务端)" + }, + { + "type": "text", + "name": "time", + "initialValue": "30", + "label": "诊断时长", + "tooltips": "请输入你需要诊断时长,单位秒" + } + ], + "variables": [], + "pannels": [ + { + "key": "rt_request_set", + "type": "table", + "title": "请求RT时延分析", + "datasource": "request_set", + "tooltips": "时间单位为微秒" + } + ] +} + + diff --git a/sysom_web/public/resource/diagnose/v2/locales.json b/sysom_web/public/resource/diagnose/v2/locales.json new file mode 100644 index 0000000000000000000000000000000000000000..9d01653b9029e1891eed4e1240eaee22373ee896 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/locales.json @@ -0,0 +1,100 @@ +{ + "version": 1.0, + "menus": [ + "menu.diagnose.ossre", + "menu.diagnose.memory.memgraph", + "menu.diagnose.memory.filecache", + "menu.diagnose.memory.oomcheck", + "menu.diagnose.memory.clustermem", + "menu.diagnose.cpu.schedmoni", + "menu.diagnose.cpu.loadtask", + "menu.diagnose.storage.iolatency", + "menu.diagnose.storage.iofsstat", + "menu.diagnose.storage.iohang", + "menu.diagnose.storage.iodiagnose", + "menu.diagnose.net.packetdrop", + "menu.diagnose.net.jitter", + "menu.diagnose.net.retran", + "menu.diagnose.net.pingtrace", + "menu.diagnose.link.rtdelay", + "menu.diagnose.link.procdiag", + "menu.diagnose.link.jruntime", + "menu.diagnose.custom.rca", + "menu.diagnose.colocation.cpi", + "menu.diagnose.colocation.serveutil" + ], + "locales": { + "zh-CN": { + "menu.diagnose.ossre":"系统健康检查", + "menu.diagnose.memory": "内存诊断中心", + "menu.diagnose.storage": "存储诊断中心", + "menu.diagnose.net": "网络诊断中心", + "menu.diagnose.cpu": "调度诊断中心", + "menu.diagnose.link": "链路诊断中心", + "menu.diagnose.custom": "自定义诊断中心", + "menu.diagnose.colocation" : "混部诊断中心", + "menu.diagnose.memory.memgraph": "内存大盘", + "menu.diagnose.memory.filecache": "Cache分析", + "menu.diagnose.memory.oomcheck": "OOM诊断", + "menu.diagnose.memory.clustermem": "内存异常诊断", + "menu.diagnose.cpu.schedmoni": "调度抖动诊断", + "menu.diagnose.cpu.loadtask": "系统负载诊断", + "menu.diagnose.storage.iolatency": "IO时延分析", + "menu.diagnose.storage.iofsstat": "IO流量分析", + "menu.diagnose.storage.iohang": "IO HANG诊断", + "menu.diagnose.storage.iodiagnose": "IO 一键诊断", + "menu.diagnose.net.packetdrop": "丢包诊断", + "menu.diagnose.net.jitter": "抖动诊断", + "menu.diagnose.net.retran": "重传诊断", + "menu.diagnose.net.pingtrace": "时延诊断", + "menu.diagnose.link.rtdelay": "RT时延分析", + "menu.diagnose.link.jruntime": "java运行时分析", + "menu.diagnose.link.procdiag": "应用抖动诊断", + "menu.diagnose.custom.rca": "指标异常分析", + "menu.diagnose.colocation.cpi": "混部CPI干扰诊断", + "menu.diagnose.colocation.serveutil": "混部CFS满足率干扰诊断", + + "pages.diagnose.ossre.ossre_result.inspect_items": "检查项目", + "pages.diagnose.ossre.ossre_result.status": "状态", + "pages.diagnose.ossre.ossre_result.abnormal_count": "异常数", + "pages.diagnose.ossre.ossre_result.inspect_result": "检查结果", + "pages.diagnose.ossre.ossre_result.options": "操作" + }, + "en-US": { + "menu.diagnose.ossre":"System Diagnosis", + "menu.diagnose.memory": "Memory Diagnosis Center", + "menu.diagnose.storage": "Storage Diagnosis Center", + "menu.diagnose.net": "Network Diagnosis Center", + "menu.diagnose.cpu": "Scheduling Diagnosis Center", + "menu.diagnose.link": "Link Diagnosis Center", + "menu.diagnose.custom": "Custom Diagnosis Center", + "menu.diagnose.colocation": "Colocation Diagnosis Center", + "menu.diagnose.memory.memgraph": "Memory Graph", + "menu.diagnose.memory.filecache": "Cache Analysis", + "menu.diagnose.memory.oomcheck": "OOM Diagnosis", + "menu.diagnose.memory.clustermem": "Memory Exception Diagnosis", + "menu.diagnose.cpu.schedmoni": "Scheduling Jitter Diagnosis", + "menu.diagnose.cpu.loadtask": "Loadtask Diagnosis", + "menu.diagnose.storage.iolatency": "IO Latency Analysis", + "menu.diagnose.storage.iofsstat": "IO Traffic Analysis", + "menu.diagnose.storage.iohang": "IO HANG Diagnosis", + "menu.diagnose.storage.iodiagnose": "IO One-Click Diagnosis", + "menu.diagnose.net.packetdrop": "Packet Loss Diagnosis", + "menu.diagnose.net.jitter": "Jitter Diagnosis", + "menu.diagnose.net.retran": "Retransmission Diagnosis", + "menu.diagnose.net.pingtrace": "Latency Diagnosis", + "menu.diagnose.link.rtdelay": "RT Latency Analysis", + "menu.diagnose.link.jruntime": "Java Runtime Analysis", + "menu.diagnose.link.procdiag": "Application jitter Diagnosis", + "menu.diagnose.custom.rca": "Metric Exception Analysis", + "menu.diagnose.colocation.cpi": "Colocation CPI Disturb Diagnosis", + "menu.diagnose.colocation.serveutil": "Colocation CFS Satisfication Disturb Diagnosis", + + "pages.diagnose.ossre.ossre_result.inspect_items": "Inspect Items", + "pages.diagnose.ossre.ossre_result.status": "Status", + "pages.diagnose.ossre.ossre_result.abnormal_count": "Abnormal Count", + "pages.diagnose.ossre.ossre_result.inspect_result": "Inspect Result", + "pages.diagnose.ossre.ossre_result.options": "Options" + } + } +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/memory/clustermem.json b/sysom_web/public/resource/diagnose/v2/memory/clustermem.json new file mode 100644 index 0000000000000000000000000000000000000000..a2cd7d94d0a616619ca845c13e1f0b299100dab9 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/memory/clustermem.json @@ -0,0 +1,64 @@ +{ + "servicename": "clustermem", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将会发起节点内存异常诊断" + }, + { + "type": "text", + "name": "pod_name", + "initialValue": "", + "label": "Pod", + "tooltips": "请输入你要诊断的pod名字,我们将会发起pod内存异常诊断" + }, + { + "type": "text", + "name": "time", + "initialValue": "", + "label": "异常时刻", + "tooltips": "请输入异常时间点,格式:2023-07-08 12:23:33" + }, + { + "type": "text", + "name": "diagnosis_type", + "initialValue": "", + "label": "诊断类型", + "tooltips": "请输入诊断类型,内存高诊断/内存延时诊断" + } + ], + "variables": [], + "pannels": [ + { + "key": "clustermem", + "type": "markdown", + "title": "诊断结果", + "datasource": "ClustermemResult" + }, + { + "key": "mem_usage", + "type": "stat", + "title": "节点/Pod已用内存构成", + "datasource": "UsageResult", + "fieldConfig": { + "unit": "%" + } + }, + { + "key": "podmem", + "type": "table", + "title": "缓存排序", + "datasource": "podmem", + "fieldConfig": { + "unit": "KB" + }, + "tableConfig": { + "enableSortColumn": ["文件Cached大小"] + } + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/memory/filecache.json b/sysom_web/public/resource/diagnose/v2/memory/filecache.json new file mode 100644 index 0000000000000000000000000000000000000000..6462917dfd513d11a9569abb456d6e1fc3b23216 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/memory/filecache.json @@ -0,0 +1,41 @@ +{ + "servicename": "filecache", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "value", + "initialValue": "", + "label": "容器", + "tooltips": "请输入你要诊断的容器ID,Pod名,cgroup" + }, + { + "type": "select", + "name": "type", + "initialValue": "all", + "label": "诊断类型", + "tooltips": "请输入需要诊断的类型(容器,POD,cgroup, host, all(所有容器))", + "options":[{"value":"pod", "label":"pod"}, {"value":"container", "label":"container"},{"value":"cgroup", "label":"cgroup"}, {"value":"host", "label":"host"},{"value":"all", "label":"all"}] + } + ], + "variables": [], + "pannels": [ + { + "key": "podmem", + "type": "table", + "title": "缓存排序", + "datasource": "podmem", + "fieldConfig": { + "unit": "KB" + } + } + ] + +} diff --git a/sysom_web/public/resource/diagnose/v2/memory/memgraph.json b/sysom_web/public/resource/diagnose/v2/memory/memgraph.json new file mode 100644 index 0000000000000000000000000000000000000000..83a28d31edc2686dbce233957f29dd58de495986 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/memory/memgraph.json @@ -0,0 +1,95 @@ +{ + "servicename": "memgraph", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + } + ], + "variables": [], + "pannels": [ + { + "key": "memEvent", + "type": "stat", + "title": "MemEvent", + "datasource": "dataMemEvent", + "fieldConfig": { + "mappings": [ + { + "type": "value", + "options": { + "OK": { + "color": "green", + "index": 0 + }, + "NG": { + "color": "red", + "index": 1 + } + } + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 5 + }, + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "%" + } + }, + { + "key": "pieRow", + "type": "row", + "title": "测试行", + "datasource": "", + "children": [ + { + "key": "memOverView", + "type": "piechart", + "title": "MemOverView", + "datasource": "dataMemOverView" + }, + { + "key": "kerMem", + "type": "piechart", + "title": "KernelMem", + "datasource": "dataKerMem" + }, + { + "key": "userMem", + "type": "piechart", + "title": "UserMem", + "datasource": "dataUserMem" + } + ] + }, + { + "key": "procMemList", + "type": "table", + "title": "TaskTop", + "datasource": "dataProcMemList" + }, + { + "key": "cacheList", + "type": "table", + "title": "CacheTop", + "datasource": "dataCacheList", + "fieldConfig": { + "unit": "KB" + } + } + ] + +} diff --git a/sysom_web/public/resource/diagnose/v2/memory/oomcheck.json b/sysom_web/public/resource/diagnose/v2/memory/oomcheck.json new file mode 100644 index 0000000000000000000000000000000000000000..3ba2c89d336e1c45cb72cb4c583b9588b6a50001 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/memory/oomcheck.json @@ -0,0 +1,54 @@ +{ + "servicename": "oomcheck", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "time", + "initialValue": "", + "label": "诊断时间", + "tooltips": "请输入需要诊断OOM的时间点,默认为最近一次" + } + ], + "variables": [], + "pannels": [ + { + "key": "oomAnalysis", + "type": "stat", + "title": "OOM Analysis", + "datasource": "oomAnalysis" + }, + { + "key": "oomEvent", + "type": "stat", + "title": "OOM Diagnose Result", + "datasource": "oomResult" + }, + { + "key": "oomDetail", + "type": "table", + "title": "OOM Diagnose Detail", + "datasource": "oomDetail", + "fieldConfig": { + "unit": "KB" + } + }, + { + "key": "oomTask", + "type": "table", + "title": "OOM Tasks Detail", + "datasource": "oomTask", + "fieldConfig": { + "unit": "KB" + } + } + ] + +} diff --git a/sysom_web/public/resource/diagnose/v2/multichannel.json b/sysom_web/public/resource/diagnose/v2/multichannel.json new file mode 100644 index 0000000000000000000000000000000000000000..e30d171b977ccf3f4e60d36a2b775262ccf7945e --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/multichannel.json @@ -0,0 +1,69 @@ +{ + "version": 1.0, + "channels": [ + { + "name": "ssh", + "label": "SSH通道", + "extra_params": { + "*": [ + { + "type": "text", + "name": "channel", + "initialValue": "ssh", + "label": "通道类型", + "disabled": true + } + ] + } + }, + { + "name": "offline", + "label": "离线通道", + "extra_params": { + "*": [ + { + "type": "text", + "name": "channel", + "initialValue": "offline", + "label": "离线通道", + "disabled": true + }, + { + "type": "text", + "name": "sysom_preprocess_post_wrapper", + "initialValue": "dummy", + "label": "命令包装器", + "disabled": false + } + ] + }, + "override_params": { + "*": [ + { + "type": "text", + "name": "instance", + "initialValue": "anonymous", + "label": "实例", + "tooltips": "请输入你要诊断的实例ID" + } + ], + "pingtrace": [ + { + "type": "text", + "name": "origin_instance", + "initialValue": "anonymous", + "label": "源实例ID", + "tooltips": "请输入你要诊断的源实例ID, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "target_instance", + "initialValue": "anonymous", + "label": "目标实例ID", + "tooltips": "请输入你要诊断的目标实例ID, 我们将在会这台机器内部发起SysAK诊断" + } + ] + } + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/net/jitter.json b/sysom_web/public/resource/diagnose/v2/net/jitter.json new file mode 100644 index 0000000000000000000000000000000000000000..e0fe9503cf6ebca7a56c0f69b0aa3c5172a20754 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/net/jitter.json @@ -0,0 +1,93 @@ +{ + "servicename": "jitter", + "version": 1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "time", + "initialValue": "10", + "label": "运行时间", + "tooltips": "请输入你要诊断的时间,单位秒" + } + ], + "variables": [], + "pannels": [ + { + "key": "flowchart", + "type": "flow", + "title": "发送端报文路径", + "flowconfigs": { + "nodes": [ + { + "id": "send", + "x": "40", + "y": "40" + }, + { + "id": "out", + "x": "340", + "y": "40" + }, + { + "id" : "recv", + "x": "640", + "y": "40" + } + ], + "edges": [ + { + "source": "send", + "target": { + "cell": "out", + "port": "right" + } + }, + { + "source": "out", + "target": { + "cell": "recv", + "port": "right" + } + } + ] + }, + "datasource": "senderflow" + }, + { + "key": "flowchart", + "type": "flow", + "title": "接收端报文路径", + "flowconfigs": { + "nodes": [ + { + "id": "recv", + "x": "40", + "y": "40" + }, + { + "id" : "send", + "x": "40", + "y": "200" + } + ], + "edges": [ + { + "source": "recv", + "target": { + "cell": "send", + "port": "top" + } + } + ] + }, + "datasource": "receiverflow" + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/net/packetdrop.json b/sysom_web/public/resource/diagnose/v2/net/packetdrop.json new file mode 100644 index 0000000000000000000000000000000000000000..eaac1155a5e71e15287013e7cac94c9e30500289 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/net/packetdrop.json @@ -0,0 +1,41 @@ +{ + "servicename": "packetdrop", + "version": 1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "time", + "initialValue": "10", + "label": "运行时间", + "tooltips": "请输入你要诊断的时间,单位秒" + } + ], + "variables": [], + "pannels": [ + { + "key": "packetDropEvent", + "type": "stat", + "title": "packetDrop overview", + "datasource": "packetDropSummary" + }, + { + "key": "packetDropAnalysis", + "type": "table", + "title": "丢包根因分析", + "datasource": "packetDropAnalysis" + }, + { + "key": "packetDropList", + "type": "table", + "title": "丢包详情列表", + "datasource": "packetDropList" + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/net/pingtrace.json b/sysom_web/public/resource/diagnose/v2/net/pingtrace.json new file mode 100644 index 0000000000000000000000000000000000000000..d1336128bc9e4f361924af568b6a6684a4387924 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/net/pingtrace.json @@ -0,0 +1,138 @@ +{ + "servicename": "pingtrace", + "version": 1.0, + "taskform": [ + { + "type": "select_host", + "name": "origin_instance", + "initialValue": "", + "label": "源实例ip", + "tooltips": "请输入你要诊断的源实例IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "target_instance", + "initialValue": "", + "label": "目标实例ip", + "tooltips": "请输入你要诊断的目标实例IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "pkg_num", + "initialValue": "100", + "label": "追踪包数", + "tooltips": "请输入你要追踪的包数" + }, + { + "type": "text", + "name": "time_gap", + "initialValue": "1000", + "label": "间隔毫秒数", + "tooltips": "请输入发包间隔" + }, + { + "type": "select", + "name": "type", + "initialValue": "icmp", + "label": "报文协议", + "tooltips": "请输入报文协议(icmp,tcp,udp)", + "options":[{"value":"icmp", "label":"icmp"}, {"value":"tcp", "label":"tcp"},{"value":"udp", "label":"udp"}] + } + ], + "variables": [], + "pannels": [ + { + "key": "pingtraceLatency", + "type": "flow", + "title": "Overall delay distribution", + "flowconfigs": { + "layout":{ + "type": "grid", + "rows": "2", + "cols": "4" + }, + "nodes": [ + { + "id": "l_tx_kern", + "tips": "The time delay of the transmitting path in client OS kernel(unit: us)" , + "col": 0, + "row": 0 + }, + { + "id": "l_tx_qdisc", + "tips":"The time delay of client OS kernel qdisc(unit: us)", + "col": 1, + "row": 0 + }, + { + "id": "l_tx_outlink", + "tips": "The time delay of outter link(unit: us)", + "col": 2, + "row": 0 + }, + { + "id": "r_tx_kern", + "tips": "The time delay of the transmitting path in server OS kernel(unit: us)", + "col": 3, + "row": 0 + }, + { + "id": "l_rx_inlink", + "tips": "The time delay of inner link(unit: us)", + "col": 3, + "row": 1 + }, + { + "id": "l_rx_kern", + "tips": "The time delay of the receiving path in client OS kernel(unit: us)", + "col": 2, + "row": 1 + }, + { + "id": "l_rx_task_waking", + "tips": "The time delay of pingtrace task wakeup(unit: us)", + "col": 1, + "row": 1 + }, + { + "id": "l_rx_task_queue", + "tips": "The time delay of pingtrace task queue(unit: us)", + "col": 0, + "row": 1 + } + ], + "edges": [ + { + "source": "l_tx_kern", + "target": "l_tx_qdisc" + }, + { + "source": "l_tx_qdisc", + "target": "l_tx_outlink" + }, + { + "source": "l_tx_outlink", + "target": "r_tx_kern" + }, + { + "source": "r_tx_kern", + "target": "l_rx_inlink" + }, + { + "source": "l_rx_inlink", + "target": "l_rx_kern" + }, + { + "source": "l_rx_kern", + "target": "l_rx_task_waking" + }, + { + "source": "l_rx_task_waking", + "target": "l_rx_task_queue" + } + ] + }, + "datasource": "pingtraceFlow" + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/net/readme b/sysom_web/public/resource/diagnose/v2/net/readme new file mode 100644 index 0000000000000000000000000000000000000000..1080d80ef778b573d90b79c85df19015ce10a322 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/net/readme @@ -0,0 +1 @@ +请在此文件夹添加网络相关的面板配置文件。 diff --git a/sysom_web/public/resource/diagnose/v2/net/retran.json b/sysom_web/public/resource/diagnose/v2/net/retran.json new file mode 100644 index 0000000000000000000000000000000000000000..31206f18bf54a6b1b3d86429e666fe3d1d4042ab --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/net/retran.json @@ -0,0 +1,75 @@ +{ + "servicename": "retran", + "version": 1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP, 我们将在会这台机器内部发起SysAK诊断" + }, + { + "type": "text", + "name": "time", + "initialValue": "10", + "label": "运行时间", + "tooltips": "请输入你要诊断的时间,单位秒" + } + ], + "variables": [], + "pannels": [ + { + "key": "retranSummary", + "type": "stat", + "title": "retran overview", + "datasource": "retranSummary" + }, + { + "key": "portDistribution", + "type": "row", + "title": "port distribution", + "datasource": "portDistribution", + "children": [ + { + "key": "sportDistribution", + "type": "piechart", + "title": "sourcePortDistribution", + "datasource": "sourcePortDistribution" + }, + { + "key": "destPortDistribution", + "type": "piechart", + "title": "destPortDistribution", + "datasource": "destPortDistribution" + } + ] + }, + { + "key": "ipDistribution", + "type": "row", + "title": "ip distribution", + "datasource": "ipDistribution", + "children": [ + { + "key": "sourceIpDistribution", + "type": "piechart", + "title": "sourceIpDistribution", + "datasource": "sourceIpDistribution" + }, + { + "key": "destIpDistribution", + "type": "piechart", + "title": "destIpDistribution", + "datasource": "destIpDistribution" + } + ] + }, + { + "key": "retranList", + "type": "table", + "title": "重传详情列表", + "datasource": "retranList" + } + ] +} \ No newline at end of file diff --git a/sysom_web/public/resource/diagnose/v2/ossre.json b/sysom_web/public/resource/diagnose/v2/ossre.json new file mode 100644 index 0000000000000000000000000000000000000000..978352bb3a989b5e713200d1bd4d9eb1221b844c --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/ossre.json @@ -0,0 +1,115 @@ +{ + "servicename": "ossre", + "version": 1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP" + } + ], + "variables": [], + "pannels": [ + { + "key": "pieRow", + "type": "row", + "title": "总体检查情况", + "datasource": "", + "children": [ + { + "key": "hostInfo", + "type": "markdown", + "title": "系统信息", + "datasource": "hostInfo" + }, + { + "key": "checkItems", + "type": "piechart", + "title": "检查项", + "datasource": "checkItems" + } + ] + }, + { + "key": "ossre_result", + "type": "table", + "title": "检查列表", + "datasource": "OssreResult", + "columns": [ + { + "key": "inspect_items", + "title": "pages.diagnose.ossre.ossre_result.inspect_items", + "valueType": "text", + "width": 200 + }, + { + "key": "status", + "title": "pages.diagnose.ossre.ossre_result.status", + "valueEnum": { + "normal": { + "text": "正常", + "status": "Success" + }, + "warning": { + "text": "告警", + "status": "Warning" + }, + "error": { + "text": "异常", + "status": "Error" + }, + "critical": { + "text": "严重异常", + "status": "Error" + } + }, + "width": 100 + }, + { + "key": "abnormal_count", + "title": "pages.diagnose.ossre.ossre_result.abnormal_count", + "valueType": "digit", + "width": 100 + }, + { + "key": "inspect_result", + "title": "pages.diagnose.ossre.ossre_result.inspect_result", + "valueType": "text" + }, + { + "key": "options", + "title": "pages.diagnose.ossre.ossre_result.options", + "valueType": "custom_options", + "width": 200 + } + ], + "fieldConfig": { + "mappings": [ + { + "type": "value", + "options": { + "critical": { + "color": "#FF000066", + "text": "严重异常" + }, + "error": { + "color": "#FF663366", + "text": "异常" + }, + "warning": { + "color": "#FF990066", + "text": "告警" + }, + "normal": { + "color": "#00990066", + "text": "正常" + } + } + } + ] + } + } + ] +} diff --git a/sysom_web/public/resource/diagnose/v2/rca/.gitkeep b/sysom_web/public/resource/diagnose/v2/rca/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sysom_web/public/resource/diagnose/v2/storage/iodiagnose.json b/sysom_web/public/resource/diagnose/v2/storage/iodiagnose.json new file mode 100644 index 0000000000000000000000000000000000000000..59ba88a6209bd84844b676ddd1f448c5b410184b --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/storage/iodiagnose.json @@ -0,0 +1,35 @@ +{ + "servicename": "iodiagnose", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP,我们将在会这台机器内部发起IO诊断" + }, + { + "type": "text", + "name": "timeout", + "initialValue": "30", + "label": "诊断时长", + "tooltips": "诊断时长,也是IO诊断统计周期,单位秒,不建议低于30秒" + } + ], + "variables": [], + "pannels": [ + { + "key": "iodiagnoseOverview", + "type": "table", + "title": "diagnose overview", + "datasource": "iodiagnoseOverview" + }, + { + "key": "iodiagnoseDetail", + "type": "table", + "title": "diagnose result", + "datasource": "iodiagnoseDetail" + } + ] +} diff --git a/sysom_web/public/resource/diagnose/v2/storage/iofsstat.json b/sysom_web/public/resource/diagnose/v2/storage/iofsstat.json new file mode 100644 index 0000000000000000000000000000000000000000..f63b8000c7261059faa67af7d870c2371496feed --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/storage/iofsstat.json @@ -0,0 +1,60 @@ +{ + "servicename": "iofsstat", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP,我们将在会这台机器内部发起IO流量统计" + }, + { + "type": "text", + "name": "timeout", + "initialValue": "15", + "label": "诊断时长", + "tooltips": "诊断时长,也是IO流量统计周期,单位秒,建议不超过60秒" + }, + { + "type": "text", + "name": "disk", + "initialValue": "", + "label": "目标磁盘", + "tooltips": "请输入你要诊断的磁盘,如vda,sda等等,缺省为所有磁盘" + } + ], + "variables": [ + { + "key": "disks", + "label": "磁盘", + "datasource": "disks" + } + ], + "pannels": [ + { + "key": "overview", + "type": "table", + "title": "diagnose result", + "datasource": "overview" + }, + { + "key": "diskIOstat", + "type": "table", + "title": "Disk IO traffic analysis", + "datasource": "diskIOstat_${disks}" + }, + { + "key": "taskIOstat", + "type": "table", + "title": "Process IO traffic analysis", + "datasource": "taskIOstat_${disks}" + }, + { + "key": "taskIOblocksize", + "type": "table", + "title": "Process IO Block Size Distribution", + "datasource": "taskIOblocksize_${disks}" + } + ] +} diff --git a/sysom_web/public/resource/diagnose/v2/storage/iohang.json b/sysom_web/public/resource/diagnose/v2/storage/iohang.json new file mode 100644 index 0000000000000000000000000000000000000000..cafec68a5db58e990516d0d11fad3d8c0ab0f3c5 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/storage/iohang.json @@ -0,0 +1,71 @@ +{ + "servicename": "iohang", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP,我们将在会这台机器内部发起IO HANG诊断" + }, + { + "type": "text", + "name": "timeout", + "initialValue": "10", + "label": "诊断时长", + "tooltips": "请输入你要诊断的时长,单位秒" + }, + { + "type": "text", + "name": "threshold", + "initialValue": "5000", + "label": "时间阈值", + "tooltips": "保留IO HANG住时间超过阈值的IO,单位毫秒" + }, + { + "type": "text", + "name": "disk", + "initialValue": "", + "label": "目标磁盘", + "tooltips": "请输入你要诊断的磁盘,如vda,sda等等,缺省为所有磁盘" + } + ], + "variables": [ + { + "key": "disks", + "label": "磁盘", + "datasource": "disks" + } + ], + "pannels": [ + { + "key": "iohangOverview", + "type": "stat", + "title": "IO HANG overview", + "datasource": "iohangOverview_${disks}", + "fieldConfig": { + "mappings": [{ + "type": "value", + "options": { + "normal": { "color": "green" }, + "abnormal": { "color": "red" } + } + }], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": 1 }, + { "color": "green", "value": 0 } + ] + } + } + }, + { + "key": "singleIO", + "type": "table", + "title": "More details of TOP 10 IO", + "datasource": "singleIO_${disks}" + } + ] +} diff --git a/sysom_web/public/resource/diagnose/v2/storage/iolatency.json b/sysom_web/public/resource/diagnose/v2/storage/iolatency.json new file mode 100644 index 0000000000000000000000000000000000000000..c4b6615ef41bd0142416fd3c0635d1d0be80da3d --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/storage/iolatency.json @@ -0,0 +1,128 @@ +{ + "servicename": "iolatency", + "version":1.0, + "taskform": [ + { + "type": "select_host", + "name": "instance", + "initialValue": "", + "label": "实例IP", + "tooltips": "请输入你要诊断的IP,我们将在会这台机器内部发起IO时延诊断" + }, + { + "type": "text", + "name": "timeout", + "initialValue": "10", + "label": "诊断时长", + "tooltips": "请输入你要诊断的时长,单位秒" + }, + { + "type": "text", + "name": "threshold", + "initialValue": "1000", + "label": "时间阈值", + "tooltips": "保留IO延迟大于设定时间阈值的IO(时间单位:ms)" + }, + { + "type": "text", + "name": "disk", + "initialValue": "", + "label": "目标磁盘", + "tooltips": "请输入你要诊断的磁盘,如vda,sda等等,缺省为所有磁盘" + } + ], + "variables": [ + { + "key": "disks", + "label": "磁盘", + "datasource": "disks" + } + ], + "pannels": [ + { + "key": "iolatencyOverview", + "type": "stat", + "title": "Iolatency overview", + "datasource": "iolatencyOverview_${disks}", + "fieldConfig": { + "mappings": [{ + "type": "value", + "options": { + "normal": { "color": "green" }, + "abnormal": { "color": "red" } + } + }], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": 0 }, + { "color": "green", "value": -1 } + ] + } + } + }, + { + "key": "iolatencyDistribution", + "type": "flow", + "title": "Overall delay distribution", + "flowconfigs": { + "layout":{ + "type": "linear" + }, + "nodes": [ + { + "id": "os(block)", + "tips": "The time delay of the general block layer in OS kernel(unit: us)" + }, + { + "id": "os(driver)", + "tips":"The time delay of OS kernel driver(unit: us)" + }, + { + "id": "disk", + "tips": "The time delay of disk(unit: us)" + }, + { + "id": "os(complete)", + "tips": "The time delay of io complete in OS kernel(unit: us)" + }, + { + "id": "os(done)", + "tips": "The time delay of io done in OS kernel(unit: us)" + } + ], + "edges": [ + { + "source": "os(block)", + "target": "os(driver)" + }, + { + "source": "os(driver)", + "target": "disk" + }, + { + "source": "disk", + "target": "os(complete)" + }, + { + "source": "os(complete)", + "target": "os(done)" + } + ] + }, + "datasource": "iolatencyDistribution_${disks}" + }, + { + "key": "singleIOMetrics", + "type": "timeseries", + "title": "Single IO delay metrics display", + "datasource": "singleIOMetrics_${disks}" + }, + { + "key": "singleIO", + "type": "table", + "title": "More details of TOP 10 IO", + "datasource": "singleIO_${disks}" + } + ] +} diff --git a/sysom_web/public/resource/diagnose/v2/storage/readme b/sysom_web/public/resource/diagnose/v2/storage/readme new file mode 100644 index 0000000000000000000000000000000000000000..e2b3bc2b67115d2eae966922c6036e4101e248f9 --- /dev/null +++ b/sysom_web/public/resource/diagnose/v2/storage/readme @@ -0,0 +1 @@ +请在此文件夹添加IO相关的面板配置文件。 diff --git a/sysom_web/src/app.jsx b/sysom_web/src/app.jsx index 56fc9cac66a5c9039e7f94ea765659f7e1f0b68e..89a200b5c2c3a834f53041617a27083186c0eb72 100644 --- a/sysom_web/src/app.jsx +++ b/sysom_web/src/app.jsx @@ -10,9 +10,10 @@ const loginPath = '/user/login'; const noNeedLoginRoutes = [ "/user/login", "/diagnose/detail", + "/diagnose/custom/pannel", ] -const isNeedLogin = function(path) { +const isNeedLogin = function (path) { return !noNeedLoginRoutes.find(item => path.startsWith(item)) } @@ -107,6 +108,7 @@ export const layout = ({ initialState }) => { let extraGrafanaRoutes = []; let extraDiagnoseRoute = []; +let extraAppObserverRoute = []; // Saved menu_name -> service_name // @see menuName => config/routes.js // @see servierName => /api/v1/services/list @@ -114,7 +116,7 @@ let menuNameMapServiceName = { user: "sysom_api", welcome: "sysom_api", host: "sysom_api", - journal: "sysom_api", + journal: "sysom_log", monitor: "sysom_monitor_server", vmcore: "sysom_vmcore", diagnose: "sysom_diagnosis", @@ -122,144 +124,196 @@ let menuNameMapServiceName = { security: "sysom_vul", hotfix: "sysom_hotfix", alarm: "sysom_alarm", + app_observable: "sysom_monitor_server", } let enable_services = []; -function customizer(objValue, srcValue) { - if (_.isArray(objValue)) { - return objValue.concat(srcValue); +/** + * + * @param {*} routes + * @param {*} newRoute eg. /diagnose/cpu/schedmoni + */ +function addRoute(routes, newRoute, component) { + console.log(`Add route ${newRoute}`); + let rootPath = routes.find((item) => item.path == "/"); + // filter(Boolean) is used to remove the empty string at the start + const parts = newRoute.split("/").filter(Boolean); + const paths = parts.reduce((acc, part, index) => { + const prevPath = index > 0 ? acc[index - 1] : ''; + acc.push(prevPath + '/' + part); + return acc; + }, []); + + let currentParent = rootPath; + + // Add /diagnose, /diagnose/cpu if not exist + paths.forEach((path, index) => { + if (index < paths.length - 1) { + if (!_.keyBy(currentParent.routes, 'path')[path]) { + currentParent.routes.push({ + path: path, + name: parts[index], + routes: [], + routes: [{ + exact: true, + path: path, + redirect: paths[index + 1], + }], + }); + console.log("redirect", { + path: path, + redirect: paths[index + 1], + }) + } + currentParent = _.keyBy(currentParent.routes, 'path')[path]; + } + }); + + // Add /diagnose/cpu/schedmoni + if (!_.keyBy(currentParent.routes, 'path')[newRoute]) { + currentParent.routes.push({ + path: newRoute, + name: parts[parts.length - 1], + component: component + }); } } export function patchRoutes({ routes }) { - //Insert the grafana dashboard item to monitor memu. - routes.find((item) => item.path == "/").routes.find((item) => item.name == "monitor") + let rootPath = routes.find((item) => item.path == "/"); + + //Insert the grafana dashboard item to monitor memu. + rootPath.routes.find((item) => item.name == "monitor") .routes.splice(-1, 0, ...extraGrafanaRoutes) + // Add diagnose routes + extraDiagnoseRoute.forEach(item => { + addRoute(routes, item.path, item.component) + }); + + // Add app_observer routes + extraAppObserverRoute.forEach(item => { + addRoute(routes, item.path, item.component) + }); + + // Filter the menu by enable_services if (enable_services.length > 0) { - let rootPath = routes.find((item) => item.path == "/") - // for (let i = 0; i < rootPath.routes.length; i++) { - // if (!!rootPath.routes[i].name && !!menuNameMapServiceName[rootPath.routes[i].name]) { - // if (!enable_services.find(v => v == menuNameMapServiceName[rootPath.routes[i].name])) { - // rootPath.routes[i].disabled = true - // rootPath.routes[i].hideChildrenInMenu = true - // // delete rootPath.routes[i] - // } - // } - // // routes.routes = routes.routes.filter(item => !!item.name && enable_services.find(v => v == menuNameMapServiceName[item.name])) - // } rootPath.routes = rootPath.routes.filter(v => !menuNameMapServiceName[v.name] || enable_services.find(v2 => v2 == menuNameMapServiceName[v.name])) } +} - //Find the array of diagonse's children. ex: io, net, memory - let diagnose = routes.find((item) => item.path == "/") - .routes.find(item => item.name == "diagnose") +import grafanaDash from './pages/Monitor/grafana' +import diagnose_component from './pages/diagnose/diagnose'; +import appobserver_component from './pages/app_observable'; - if (!!diagnose) { - // Add forder - extraDiagnoseRoute.map(item => { - if (!_.keyBy(diagnose.routes, 'path')[item.path]) { - // Add forder if not exist - diagnose.routes = diagnose.routes.concat({ - ...item, - routes: [] +export function render(oldRender) { + requestURL("/api/v1/cmg/services/list") + .then((res) => { + if (res.code == 200) { + res.data.map((item) => { + if (item.count > 0) { enable_services.push(item.service_name) } }) } + localStorage.setItem("enableServices", enable_services) }) - - //Add The extraDiagnoseRoute in it. - diagnose.routes.map(item => { - const new_routes = _.keyBy(extraDiagnoseRoute, 'path')[item.path]?.routes - if (item.routes && new_routes) { - item.routes = item.routes.concat(new_routes); - - } - if (!item.routes && new_routes) { - item.routes = new_routes + .catch(err => { + message.error(err); + return Promise.resolve() + }) + .then(() => { + //Add Grafana dashboard dynamically + const isMonitor = enable_services.find(val => { + return val === menuNameMapServiceName['monitor'] + }) + if (isMonitor) { + return requestURL('/api/v1/monitor/grafana/search') + } else { + return Promise.reject() } }) - } -} - -import grafanaDash from './pages/Monitor/grafana' -import diagnose from './pages/diagnose/diagnose'; - -export function render(oldRender) { - //Add Grafana dashboard dynamically - requestURL('/api/v1/monitor/grafana/search') .then((res) => { - if (res["code"] != 0) { - return Promise.reject(res["err"]) + let arr = Object.keys(res) + if (arr.length > 0) { + if (res["code"] != 0) { + return Promise.reject(res["err"]) + } + let datas = res["data"] + //Tranfrom from grafana folder&dashboard list to antd route tree. + extraGrafanaRoutes = datas.filter((i) => i.type == "dash-folder") + .map((folder) => { + //Add the title to locales to aviod initl FormattedMessage warning in antd core. + addLocale('zh-CN', { [`menu.monitor.${folder.title}`]: folder.title }) + return { + path: `/monitor/${folder.uid}`, + name: folder.title, + routes: datas.filter((i) => i.type == "dash-db" && i.folderId == folder.id) + .map((dash) => { + addLocale('zh-CN', { [`menu.monitor.${folder.title}.${dash.title}`]: dash.title }) + return { + name: dash.title, + path: `/monitor/${folder.uid}${dash.url}`, + component: grafanaDash + } + }) + } + }) + return Promise.resolve() } - let datas = res["data"] - //Tranfrom from grafana folder&dashboard list to antd route tree. - extraGrafanaRoutes = datas.filter((i) => i.type == "dash-folder") - .map((folder) => { - //Add the title to locales to aviod initl FormattedMessage warning in antd core. - addLocale('zh-CN', { [`menu.monitor.${folder.title}`]: folder.title }) - return { - path: `/monitor/${folder.uid}`, - name: folder.title, - routes: datas.filter((i) => i.type == "dash-db" && i.folderId == folder.id) - .map((dash) => { - addLocale('zh-CN', { [`menu.monitor.${folder.title}.${dash.title}`]: dash.title }) - return { - name: dash.title, - path: `/monitor/${folder.uid}${dash.url}`, - component: grafanaDash - } - }) - } - }) - return Promise.resolve() }) .catch(err => { - message.error("Grafana doesn't work!") + const isMonitor = enable_services.find(val => { + return val === menuNameMapServiceName['monitor'] + }) + if (isMonitor) { message.error("Grafana doesn't work!") } return Promise.resolve() }) .then(() => { - return requestURL('/resource/diagnose/v1/locales.json') + return requestURL('/resource/diagnose/v2/locales.json') }) .then((res) => { - addLocale('zh-CN', res.folder) - addLocale('zh-CN', res.dashboard) - Object.entries(res.dashboard).map(item => { - let configPath = item[0].split('.') - configPath.shift() + // 1. Add diagnoses locales + Object.entries(res["locales"]).map(item => { + if (item[0] != "version") { + addLocale(item[0], item[1]); + } + }); - let path = [] - path.push({ + // 2. Add diagnoses menu and components + extraDiagnoseRoute = []; + res["menus"].forEach(item => { + // menu.diagnose.memory.memgraph + let configPath = item.split('.'); + // diagnose.memory.memgraph + configPath.shift(); + console.log(configPath); + extraDiagnoseRoute.push({ path: `/${configPath.join('/')}`, - name: configPath.pop(), - f_: `/${configPath.join('/')}`, - component: diagnose + component: diagnose_component }) - - let currentExtraDiagnoseRoute = _.chain(path).groupBy('f_').toPairs() - .map(Item => _.merge(_.zipObject(["path", "routes"], Item), { "name": Item[0].split('/').pop() })) - .value(); - let route_item = _.keyBy(extraDiagnoseRoute, 'path')[currentExtraDiagnoseRoute[0].path] - if (!!route_item) { - route_item.routes = route_item.routes.concat(currentExtraDiagnoseRoute[0].routes) - } else { - extraDiagnoseRoute = extraDiagnoseRoute.concat(currentExtraDiagnoseRoute) - } - }) - // Request services list, used to disable not running services - return requestURL("/api/v1/monitor/services/list") + }); + return requestURL("/resource/app_observable/v1/locales.json"); }) - .then(res => { - if (res.code == 0) { - for (let k in res.data) { - if (res.data[k] > 0) { - enable_services.push(k) - } + .then((res) => { + // 1. Add app_observer locales + Object.entries(res["locales"]).map(item => { + if (item[0] != "version") { + addLocale(item[0], item[1]); } - } - oldRender(); - }) - .catch(err => { - console.log(err); - oldRender(); + }); + + // 2. Add app_observer menu and components + extraAppObserverRoute = []; + res["menus"].forEach(item => { + // menu.app_observable.ntopo + let configPath = item.split('.'); + // app_observable.ntopo + configPath.shift(); + console.log(configPath); + extraAppObserverRoute.push({ + path: `/${configPath.join('/')}`, + component: appobserver_component + }) + }); + oldRender() }) } diff --git a/sysom_web/src/components/NoticeIcon/index.jsx b/sysom_web/src/components/NoticeIcon/index.jsx index f3f0406af84e1d6650ff7dafb9c8d9f11d8d86d6..fc1532bfc4f34d3f41d5a5981673f08a1e0ca33d 100644 --- a/sysom_web/src/components/NoticeIcon/index.jsx +++ b/sysom_web/src/components/NoticeIcon/index.jsx @@ -47,6 +47,7 @@ const getNoticeData = (notices) => { const NoticeIconView = () => { const [socket, setSocket] = useState(null); const [notices, setNotices] = useState([]); + const enableServices = localStorage.getItem("enableServices") const initWebSocker = () => { // 1. 获取未读告警 @@ -67,7 +68,8 @@ const NoticeIconView = () => { } useEffect(() => { if (socket) { socket.colse } - initWebSocker() + if (enableServices.includes("alarm")) { initWebSocker() } + // initWebSocker() }, []); if (socket) { socket.onmessage = (e) => { diff --git a/sysom_web/src/locales/en-US/menu.js b/sysom_web/src/locales/en-US/menu.js index d839ba7f9032055fe88f9c37726ea0bbe3b2bbf6..b135f84835e2f8c5771902cabf07d0857fa672b1 100644 --- a/sysom_web/src/locales/en-US/menu.js +++ b/sysom_web/src/locales/en-US/menu.js @@ -122,7 +122,7 @@ export default { 'menu.editor.mind': 'Mind', 'menu.editor.koni': 'Koni', 'menu.diagnose': 'Diagnosis Center', - 'menu.diagnose.oscheck': 'System Health Check', + // 'menu.diagnose.oscheck': 'System Health Check', 'menu.diagnose.cpu': 'Dispatch Diagnosis Center', 'menu.diagnose.cpu.loadtask': 'System Load Diagnosis', 'menu.diagnose.cpu.schedmoni': 'Scheduling Jitter Diagnosiss', @@ -143,6 +143,7 @@ export default { 'menu.diagnose.custom.pannel': 'Task Panel Generation', 'menu.diagnose.custom': 'Custom Diagnostic Center', 'menu.diagnose.custom.command': 'Command Diagnostics', + 'menu.diagnose.query': 'Diagnosis Query', 'menu.journal': 'Log Center', 'menu.journal.audit': 'Audit Log', 'menu.journal.task': 'Mission Log', @@ -156,8 +157,9 @@ export default { 'menu.diagnose.memory.market': 'RAM disk', 'menu.vmcore.analyse': 'Online Analysis', 'menu.app_observable': 'Application Observability', - 'menu.app_observable.net_topo': 'Network Topology', - 'menu.app_observable.mysql': 'MySQL', - 'menu.app_observable.java': 'Java', - 'menu.app_observable.process': 'Process Observability', + // 'menu.app_observable.net_topo': 'Network Topology', + // 'menu.app_observable.mysql': 'MySQL Observability', + // 'menu.app_observable.java': 'Java Observability', + // 'menu.app_observable.process': 'Process Observability', + // 'menu.app_observable.nginx': 'Nginx Observability', }; diff --git a/sysom_web/src/locales/en-US/pages.js b/sysom_web/src/locales/en-US/pages.js index d21b5184eadcb5cbe1a5365d2474ec0020e464c4..68ffc8ac2ece78e83f11836ec53f67cd7a64b026 100644 --- a/sysom_web/src/locales/en-US/pages.js +++ b/sysom_web/src/locales/en-US/pages.js @@ -128,6 +128,10 @@ export default { 'pages.alarm.list.alert_category_monitor': 'Monitor', 'pages.alarm.list.alert_category_application': 'Application', 'pages.alarm.list.alert_category_other': 'Other', + 'pages.alarm.list.alert_level': 'Level', + 'pages.alarm.list.alert_level_warning': 'Warning', + 'pages.alarm.list.alert_level_error': 'Error', + 'pages.alarm.list.alert_level_critical': 'Critical', 'pages.alarm.list.mark_as_read': 'Mark as read', 'pages.security.list.index': 'Serial Number', 'pages.security.list.cve_id': 'Serial Number', @@ -399,6 +403,8 @@ export default { 'pages.diagnose.startdiagnosis': 'Start diagnosis', 'pages.diagnose.creationtime': 'Creation time', 'pages.diagnose.diagnosisID': 'Diagnosis ID', + 'pages.diagnose.diagnosisChannels': 'Diagnosis Channels', + 'pages.diagnose.diagnosisName': 'Diagnosis Name', 'pages.diagnose.diagnosisParams': 'Diagnosis parameters', 'pages.diagnose.diagnosisCommand': 'Diagnosis command', 'pages.diagnose.state': 'State', @@ -406,6 +412,7 @@ export default { 'pages.diagnose.completediagnosis': 'Complete diagnosis', 'pages.diagnose.anomaly': 'Anomaly', 'pages.diagnose.operation': 'Operation', + 'pages.diagnose.reset': 'Reset', 'pages.diagnose.viewdiagnosisresults': 'Viewing diagnosis results', 'pages.diagnose.viewdiagnosisdetail': 'Viewing diagnosis detail', 'pages.diagnose.viewerrormessages': 'Viewing error messages', diff --git a/sysom_web/src/locales/zh-CN/menu.js b/sysom_web/src/locales/zh-CN/menu.js index 4c4f155135a5afb41b2c2985d3448849c2ac78e3..0ec7130147b95b2a79ba6dee8e6acedfbd96f5ed 100644 --- a/sysom_web/src/locales/zh-CN/menu.js +++ b/sysom_web/src/locales/zh-CN/menu.js @@ -72,7 +72,7 @@ export default { 'menu.editor.mind': '脑图编辑器', 'menu.editor.koni': '拓扑编辑器', 'menu.diagnose': '诊断中心', - 'menu.diagnose.oscheck': '系统健康检查', + // 'menu.diagnose.oscheck': '系统健康检查', 'menu.diagnose.cpu': '调度诊断中心', 'menu.diagnose.cpu.loadtask': '系统负载诊断', 'menu.diagnose.cpu.schedmoni': '调度抖动诊断', @@ -93,6 +93,7 @@ export default { 'menu.diagnose.custom.pannel': '诊断面板配置生成', 'menu.diagnose.custom': '自定义诊断中心', 'menu.diagnose.custom.command': '命令诊断', + 'menu.diagnose.query': '诊断查询', 'menu.journal': '日志中心', 'menu.journal.audit': '审计日志', 'menu.journal.task': '任务日志', @@ -110,11 +111,14 @@ export default { 'menu.hotfix.formal': '热补丁列表', 'menu.hotfix.version': '自定义内核版本配置', 'menu.hotfix.selfdefine': '自定义热补丁制作', + 'menu.hotfix.released_hotfix': '历史发布热补丁列表', 'menu.hotfix.version.config': '操作系统配置', 'menu.hotfix.version.customize': '内核版本配置', 'menu.app_observable': '应用可观测', - 'menu.app_observable.net_topo': '网络拓扑', - 'menu.app_observable.mysql': 'MySQL', - 'menu.app_observable.java': 'Java', - 'menu.app_observable.process': '进程可观测', + // 'menu.app_observable.net_topo': '网络拓扑', + // 'menu.app_observable.mysql': 'MySQL可观测', + // 'menu.app_observable.java': 'Java可观测', + // 'menu.app_observable.process': '进程可观测', + // 'menu.app_observable.nginx': 'Nginx可观测', + // 'menu.app_observable.jitter': '抖动可观测' }; \ No newline at end of file diff --git a/sysom_web/src/locales/zh-CN/pages.js b/sysom_web/src/locales/zh-CN/pages.js index 75faeb49f1a80ba7032b0a12b5ee7a1584483a52..f665e4b46d856e01203d21c9d478cd88f189bdaa 100644 --- a/sysom_web/src/locales/zh-CN/pages.js +++ b/sysom_web/src/locales/zh-CN/pages.js @@ -55,6 +55,7 @@ export default { 'pages.hostTable.password': '密码', 'pages.hostTable.password_required': '密码是必填项!', 'pages.hostTable.status': '主机状态', + 'pages.hostTable.status.ready': '准备中', 'pages.hostTable.status.offline': '离线', 'pages.hostTable.status.running': '运行中', 'pages.hostTable.status.abnormal': '异常', @@ -129,6 +130,10 @@ export default { 'pages.alarm.list.alert_category_monitor': '监控告警', 'pages.alarm.list.alert_category_application': '应用告警', 'pages.alarm.list.alert_category_other': '其它告警', + 'pages.alarm.list.alert_level': '告警级别', + 'pages.alarm.list.alert_level_warning': '警告', + 'pages.alarm.list.alert_level_error': '错误', + 'pages.alarm.list.alert_level_critical': '严重', 'pages.alarm.list.mark_as_read': '标记已读', 'pages.security.list.index': '序号', 'pages.security.list.cve_id': '编号', @@ -242,6 +247,17 @@ export default { 'pages.hotfix.tooltips.building_image': '请输出在构建改操作系统的热补丁时需要用到的构建镜像', 'pages.hotfix.tooltips.query_time': '输入查询时间,将会返回该时间以前所有的正式包hotfix', 'pages.hotfix.tooltips.source_code_method': '如果为git方式管理,请输入该内核版本的源码branch或tag;如果使用源码包,请填入源码包下载地址', + 'pages.hotfix.tooltips.hotfixid': '热补丁ID', + 'pages.hotfix.tooltips.released_kernelverison': '已经发布的该热补丁的内核版本号,包含架构信息', + 'pages.hotfix.tooltips.serious_level': '推荐安装的级别,推荐安装一般涉及到系统稳定性和安全性补丁,需要安装涉及明确和出现概率较大的系统稳定性和安全性补丁;其他级别用户可自行判断', + 'pages.hotfix.tooltips.fix_system': '该热补丁修复涉及的子系统', + 'pages.hotfix.tooltips.description': '该热补丁的发布描述', + 'pages.hotfix.tooltips.released_time': '该热补丁发布的时间', + 'pages.hotfix.tooltips.downloadlink': '补丁下载链接', + 'pages.hotfix.tooltips.deprecated': '如果为是,说明该热补丁已经被废弃,进一步信息可见废弃信息', + 'pages.hotfix.tooltips.deprecated_info': '用于描述该热补丁废弃的信息', + 'pages.hotfix.tooltips.modified_time': '热补丁信息修改时间', + 'pages.hotfix.tooltips.modufied_user': '热补丁信息修改的用户', 'pages.hotfix.confirm.cancel': '是否确定取消?', 'pages.hotfix.confirm.delete_kernelversion': '确定要删除该内核版本?', 'pages.hotfix.confirm.delete_ostype': '是否要删除该类型?注意,删除类型会连带删除该类型关联的所有内核版本!', @@ -266,6 +282,25 @@ export default { 'pages.hotfix.building_image' : '该内核版本的构建镜像', 'pages.hotfix.use_src_rpm': '使用.src.rpm源码包', 'pages.hotfix.submit': '提交', + 'pages.hotfix.hotfixid': '热补丁ID', + 'pages.hotfix.released_kernel_version': '已发布的内核版本', + 'pages.hotfix.serious': '推荐级别', + 'pages.hotfix.description': '热补丁描述', + 'pages.hotfix.fix_system': '修复子系统', + 'pages.hotfix.released_time': '发布时间', + 'pages.hotfix.download_link': '下载链接', + 'pages.hotfix.download_link_error': '下载链接错误', + 'pages.hotfix.deprecated': '已废弃', + 'pages.hotfix.deprecated_info': '热补丁废弃信息', + 'pages.hotfix.modified_time': '最近修改时间', + 'pages.hotfix.modified_user': '修改的用户', + 'pages.hotfix.released_list': '已发布热补丁列表', + 'pages.hotfix.historicalfilter': '历史热补丁发布信息筛选', + 'pages.hotfix.create_hotfix_config': '添加HotFix发布信息', + 'pages.hotfix.create_hotfix': '添加hotfix', + 'pages.hotfix.serious_explain': '推荐信息简介', + 'pages.hotfix.edit_released_hotfix': '编辑Hotfix Released 配置', + 'pages.hotfix.bulk_import_hotfix_released': '批量导入hotfix released', 'pages.account.account_list': '账号列表', 'pages.account.username': '用户名', 'pages.account.password': '密码', @@ -405,6 +440,8 @@ export default { 'pages.diagnose.startdiagnosis': '开始诊断', 'pages.diagnose.creationtime': '创建时间', 'pages.diagnose.diagnosisID': '诊断ID', + 'pages.diagnose.diagnosisChannels': '诊断通道', + 'pages.diagnose.diagnosisName': '诊断名称', 'pages.diagnose.diagnosisParams': '诊断参数', 'pages.diagnose.diagnosisCommand': '诊断命令', 'pages.diagnose.state': '状态', @@ -412,9 +449,16 @@ export default { 'pages.diagnose.completediagnosis': '诊断完毕', 'pages.diagnose.anomaly': '异常', 'pages.diagnose.operation': '操作', + 'pages.diagnose.reset': '重置', 'pages.diagnose.viewdiagnosisresults': '查看诊断结果', 'pages.diagnose.viewdiagnosisdetail': '查看诊断详情', 'pages.diagnose.viewerrormessages': '查看出错信息', + 'page.diagnose.copycommand': 'copy 命令', + 'pages.diagnose.uploadtaskresult': '上传结果', + 'pages.diagnose.uploadtaskresultfrom': '上传任务结果表单', + 'pages.diagnose.offlinetaskresult': '离线任务结果', + 'pages.diagnose.offlinetaskresultannex': '离线任务附件', + 'pages.diagnose.selectencoding': '选择内容编码方式', 'pages.diagnose.nooperation': '暂无可用操作', 'pages.diagnose.checkitem': '检查项目', 'pages.diagnose.normal': '正常', diff --git a/sysom_web/src/models/panelGenerator.js b/sysom_web/src/models/panelGenerator.js deleted file mode 100644 index 87040551a3fe379dfc3c33e1e68042253d42bc05..0000000000000000000000000000000000000000 --- a/sysom_web/src/models/panelGenerator.js +++ /dev/null @@ -1,22 +0,0 @@ -import { useCallback, useState } from "react"; - -export default () => { - const [configStore, setConfigStore] = useState({ - servicename: 'test', - taskform: [], - pannels: [], - version: 1 - }) - const [isDraggingNow, setIsDraggingNow] = useState(false) - const changeConfigStore = useCallback((value) => { - setConfigStore(() => { - return value - }) - }, []) - return { - configStore, - setConfigStore, - isDraggingNow, - setIsDraggingNow - } -} diff --git a/sysom_web/src/models/panelGeneratorGlobalVariables.js b/sysom_web/src/models/panelGeneratorGlobalVariables.js deleted file mode 100644 index 71a0e086c16db055b0964e4cff914ad182b169db..0000000000000000000000000000000000000000 --- a/sysom_web/src/models/panelGeneratorGlobalVariables.js +++ /dev/null @@ -1,17 +0,0 @@ -import { useCallback, useState } from "react"; - -export default () => { - const [globalVariableStore, setGlobalVariableStore] = useState([{ - name: 'Example', - value: 'Example Value' - }]) - const changeGlobalVariableStore = useCallback((value) => { - setGlobalVariableStore(() => { - return value - }) - }, []) - return { - globalVariableStore: globalVariableStore, - setGlobalVariableStore: setGlobalVariableStore - } -} diff --git a/sysom_web/src/models/panelGeneratorMock.js b/sysom_web/src/models/panelGeneratorMock.js deleted file mode 100644 index 243d08524d8a5af314b1a43ee75d645a438fc99e..0000000000000000000000000000000000000000 --- a/sysom_web/src/models/panelGeneratorMock.js +++ /dev/null @@ -1,16 +0,0 @@ -import { useCallback, useState } from "react"; - -export default () => { - const [mockStore, setMockStore] = useState({ - datas:{} - }) - const changeMockStore = useCallback((value) => { - setMockStore(() => { - return value - }) - }, []) - return { - mockStore, - setMockStore - } -} diff --git a/sysom_web/src/pages/account/components/EditAccountModal.jsx b/sysom_web/src/pages/account/components/EditAccountModal.jsx index b82ba45ffc3b42a483e2b56587e96163a164f4ee..3a20bce3dfea6e33b3d2e8c6befbda7e540d5d3f 100644 --- a/sysom_web/src/pages/account/components/EditAccountModal.jsx +++ b/sysom_web/src/pages/account/components/EditAccountModal.jsx @@ -80,12 +80,13 @@ const EditAccountModal = (props) => { defaultMessage="username" /> } - placeholder={ - - } + // 国际化文件中缺少pages.account.input_username字段,导致找不到而渲染为[Object object] + // placeholder={ + // + // } rules={[ { required: true, @@ -124,12 +125,13 @@ const EditAccountModal = (props) => { defaultMessage="description" /> } - placeholder={ - - } + // 国际化文件中缺少pages.account.input_description字段,导致找不到而渲染为[Object object] + // placeholder={ + // + // } /> ); diff --git a/sysom_web/src/pages/alarm/list/index.jsx b/sysom_web/src/pages/alarm/list/index.jsx index bde22ded117c06abcb3181a96749b039c1ff8e50..4361d9ae672071d1f20d2713257af40c9360e127 100644 --- a/sysom_web/src/pages/alarm/list/index.jsx +++ b/sysom_web/src/pages/alarm/list/index.jsx @@ -73,6 +73,31 @@ const AlarmDataList = () => { }, }, }, + { + title: , + dataIndex: 'alert_level', + filters: true, + valueEnum: { + "WARNING": { + text: ( + + ), + status: 'Warning', + }, + "ERROR": { + text: ( + + ), + status: 'Error', + }, + "CRITICAL": { + text: ( + + ), + status: 'Error', + }, + }, + }, { title: , dataIndex: 'labels', diff --git a/sysom_web/src/pages/app_observable/components/multiGrafanaPannel.js b/sysom_web/src/pages/app_observable/components/multiGrafanaPannel.js new file mode 100644 index 0000000000000000000000000000000000000000..e1abcb23444367e80ac244120f8f2ec4d15fc545 --- /dev/null +++ b/sysom_web/src/pages/app_observable/components/multiGrafanaPannel.js @@ -0,0 +1,138 @@ +import { Tabs } from 'antd'; +import { useState, useEffect, useRef} from 'react'; +import { useIntl } from 'umi'; +import GrafanaWrap from '../../Monitor/grafana' + + +/** + * 封装 useState,使其具有 getState 方法,保证在回调函数中获取到的 state 值是最新的 + * (由于闭包,直接在回调函数中使用state获取到的不是最新值) + * @param {*} initVal + * @reference https://www.haorooms.com/post/usegetstate_hooks + * @returns + */ +const useGetState = (initVal) => { + const [state, setState] = useState(initVal); + const ref = useRef(initVal); + const setStateCopy = (newVal) => { + ref.current = newVal; + setState(newVal); + } + const getState = () => ref.current; + return [state, setStateCopy, getState]; +} + +/** + * 多面板配置 +{ + "menuName": "menu.app_observable.nginx", + "type": "multiGrafanaPannel", + "config": [ + { + "pannelId": "nginx_monitor", + "pannelName": "pages.app_observable.monitor_dashboard", + "pannelUrl": "/grafana/d/6Mztrm4Ik/nginx" + }, + { + "pannelId": "nginx_event", + "pannelName": "pages.app_observable.abnormal_events", + "pannelUrl": "/grafana/d/HtuWUeSSz/nginx-event" + } + ] +} + * @param {*} props + */ +const MultiGrafanaPannel = (props) => { + const intl = useIntl(); + let pannels = props.config + let pannelMap = {} + pannels.map((pannel) => { + pannelMap[pannel.pannelId] = pannel + }) + let pannelGetter = {} + let pannelSetter = {} + pannels.map((pannel) => { + const [getter, setter] = useGetState(pannel.pannelUrl); + pannelGetter[pannel.pannelId] = getter + pannelSetter[pannel.pannelId] = setter + }) + + const [currentTab, setCurrentTab, getCurrentTab] = useGetState(pannels[0]["pannelId"]); + + useEffect(() => { + const queryParams = !!props.location?.query ? props.location.query : {}; + if ('_currentTab' in queryParams) { + let _currentTab = queryParams._currentTab; + if (_currentTab in pannelGetter) { + setCurrentTab(_currentTab); + } + } + + // queryParams to query string + const queryStr = Object.keys(queryParams).map(key => key + '=' + queryParams[key]).join('&').trim(); + if (queryStr.length > 0) { + for (let pannelId in pannelGetter) { + pannelSetter[pannelId](pannelMap[pannelId]["pannelUrl"] + `?${queryStr}`); + } + } + }, []); + + let items = pannels.map((pannel) => { + return { + label: intl.formatMessage({ + id: pannel.pannelName, + defaultMessage: pannel.pannelName, + }), + key: pannel.pannelId, + forceRender: true, + children: ( +
+ { + currentTab == pannel.pannelId ? + { + if (newUrl.trim() == "about:blank") { + return + } + if (getCurrentTab() == pannel.pannelId) { + let targetPannelId = pannel.pannelId; + for (let otherPannelId in pannelGetter) { + if (otherPannelId == pannel.pannelId) { + continue + } + pannelSetter[otherPannelId](newUrl.replace(pannelMap[pannel.pannelId]["pannelUrl"], pannelMap[otherPannelId]["pannelUrl"])); + if (newUrl.indexOf(pannelMap[otherPannelId]["pannelUrl"]) >= 0) { + targetPannelId = otherPannelId; + } + } + if (targetPannelId != pannel.pannelId) { + setCurrentTab(targetPannelId); + } + } + }} + urlChangeInvokeInterval={50} + /> + : + {pannel.pannelId} + } +
+ ) + } + }); + + return ( +
+ { + setCurrentTab(key); + }} + /> +
+ ) +}; + +export default MultiGrafanaPannel; \ No newline at end of file diff --git a/sysom_web/src/pages/app_observable/components/singleGrafanaPannel.js b/sysom_web/src/pages/app_observable/components/singleGrafanaPannel.js new file mode 100644 index 0000000000000000000000000000000000000000..711014ffd67efdc0ad25779094ef722caed130f3 --- /dev/null +++ b/sysom_web/src/pages/app_observable/components/singleGrafanaPannel.js @@ -0,0 +1,39 @@ +import GrafanaWrap from '../../Monitor/grafana' + +/** + * 单面板配置 +{ + "menuName": "menu.app_observable.ntopo", + "type": "multiGrafanaPannel", + "config": { + "pannelId": "ntopo", + "pannelName": "", + "pannelUrl": "/grafana/d/H04tHN34k/ntopo" + }, + "locales": { + "zh-CN": { + "menu.app_observable.ntopo": "网络拓扑" + }, + "en-US": { + "menu.app_observable.ntopo": "Network Topology" + } + } +} + * @param {*} props + */ +const SingleGrafanaPannel = (props) => { + const queryParams = !!props.queryParams ? props.queryParams : {}; + // queryParams to query string + const queryStr = Object.keys(queryParams).map(key => key + '=' + queryParams[key]).join('&').trim(); + let targetUrl = props.config.pannelUrl; + if (queryStr.length > 0) { + targetUrl += `?${queryStr}` + } + return ( +
+ +
+ ) +}; + +export default SingleGrafanaPannel; \ No newline at end of file diff --git a/sysom_web/src/pages/app_observable/index.js b/sysom_web/src/pages/app_observable/index.js new file mode 100644 index 0000000000000000000000000000000000000000..fe85e16458d84acb173932d2e7cbd2889baa7b0f --- /dev/null +++ b/sysom_web/src/pages/app_observable/index.js @@ -0,0 +1,42 @@ +import { useEffect, useState } from 'react'; +import { request } from 'umi'; +import MultiGrafanaPannel from './components/multiGrafanaPannel'; +import SingleGrafanaPannel from './components/singleGrafanaPannel'; + +const components = { + "multiGrafanaPannel": MultiGrafanaPannel, + "singleGrafanaPannel": SingleGrafanaPannel +} + +/** + * 应用观测动态渲染 + * @returns + */ +const AppObservable = (props) => { + const [pannelConfig, setPannelConfig] = useState(); + useEffect(() => { + console.log("useEffect"); + // Get config + let urlslice = props.match.url.split("/"); + urlslice.splice(2, 0, "v1"); + request(`/resource${urlslice.join("/").toLowerCase()}.json`) + .then((res) => { + console.log("setPannelConfig", res); + setPannelConfig(res); + }) + .catch((err) => { + console.log(err); + }); + }, []); + if (!pannelConfig) { + return <>; + } + let pannelType = pannelConfig.type; + if (!(pannelType in components)) { + return <>; + } + let Component = components[pannelType]; + return (); +}; + +export default AppObservable; \ No newline at end of file diff --git a/sysom_web/src/pages/app_observable/mysql/index.js b/sysom_web/src/pages/app_observable/mysql/index.js index 93c1cf95b0cf2a7772971aebb2c93cb74f29953f..a7532ed8b9c750f01f5a610f6159051f4c78bd27 100644 --- a/sysom_web/src/pages/app_observable/mysql/index.js +++ b/sysom_web/src/pages/app_observable/mysql/index.js @@ -1,28 +1,5 @@ -import { Tabs } from 'antd'; -import { useState, useEffect, useRef } from 'react'; -import GrafanaWrap from '../../Monitor/grafana' import { useIntl } from 'umi'; - -const monitorPannelTargetUrlBase = '/grafana/d/hOk70b34k/app-mysql'; -const abnormalEventsTargetUrlBase = '/grafana/d/Ub__1x3Vz/app-mysql-events'; - -/** - * 封装 useState,使其具有 getState 方法,保证在回调函数中获取到的 state 值是最新的 - * (由于闭包,直接在回调函数中使用state获取到的不是最新值) - * @param {*} initVal - * @reference https://www.haorooms.com/post/usegetstate_hooks - * @returns - */ -const useGetState = (initVal) => { - const [state, setState] = useState(initVal); - const ref = useRef(initVal); - const setStateCopy = (newVal) => { - ref.current = newVal; - setState(newVal); - } - const getState = () => ref.current; - return [state, setStateCopy, getState]; -} +import MultiGrafanaPannel from '../components/multiGrafanaPannel'; /** * MySQL应用观测 @@ -30,100 +7,29 @@ const useGetState = (initVal) => { */ const AppObservableMysql = (props) => { const intl = useIntl(); - const [monitorPannelTargetUrl, setMonitorPannelTargetUrl] = useState(monitorPannelTargetUrlBase); - const [abnormalEventsTargetUrl, setAbnormalEventsTargetUrl] = useState(abnormalEventsTargetUrlBase); - const [currentTab, setCurrentTab, getCurrentTab] = useGetState('monitor_panenl'); - - useEffect(() => { - const queryParams = !!props.location?.query ? props.location.query : {}; - console.log(queryParams); - if ('_currentTab' in queryParams) { - let _currentTab = queryParams._currentTab; - if (_currentTab == 'abnormal_events') { - setCurrentTab('abnormal_events'); - } else { - setCurrentTab('monitor_panenl'); - } - } - // queryParams to query string - const queryStr = Object.keys(queryParams).map(key => key + '=' + queryParams[key]).join('&').trim(); - if (queryStr.length > 0) { - setMonitorPannelTargetUrl(monitorPannelTargetUrlBase + `?${queryStr}`); - setAbnormalEventsTargetUrl(abnormalEventsTargetUrlBase + `?${queryStr}`); - } - }, []); - - const monitor_panenl = { - label: intl.formatMessage({ - id: 'pages.app_observable.monitor_dashboard', - defaultMessage: 'Monitor Dashboard', - }), - key: "monitor_panenl", - forceRender: true, - children: ( -
+ return ( + { - if (getCurrentTab() == 'monitor_panenl') { - setAbnormalEventsTargetUrl(newUrl.replace(monitorPannelTargetUrlBase, abnormalEventsTargetUrlBase)); - if (newUrl.indexOf(abnormalEventsTargetUrlBase) >= 0) { - setCurrentTab('abnormal_events'); - } - } - }} - urlChangeInvokeInterval={100} - /> - : - monitor_panenl - } -
- ) - } - - const abnormal_events = { - label: intl.formatMessage({ - id: 'pages.app_observable.abnormal_events', - defaultMessage: 'Abnormal Events', - }), - key: "abnormal_events", - forceRender: true, - children: ( -
+ "pannelId": "mysql_monitor", + "pannelName": intl.formatMessage({ + id: 'pages.app_observable.monitor_dashboard', + defaultMessage: 'Monitor Dashboard', + }), + "pannelUrl": '/grafana/d/hOk70b34k/app-mysql', + }, { - currentTab == "abnormal_events" ? - { - if (getCurrentTab() == 'abnormal_events') { - setMonitorPannelTargetUrl(newUrl.replace(abnormalEventsTargetUrlBase, monitorPannelTargetUrlBase)); - if (newUrl.indexOf(monitorPannelTargetUrlBase) >= 0) { - setCurrentTab('monitor_panenl'); - } - } - }} - urlChangeInvokeInterval={100} - /> - : - abnormal_events + "pannelId": "mysql_event", + "pannelName": intl.formatMessage({ + id: 'pages.app_observable.abnormal_events', + defaultMessage: 'Abnormal Events', + }), + "pannelUrl": '/grafana/d/Ub__1x3Vz/app-mysql-events', } - -
- ) - } - return ( -
- { - setCurrentTab(key); - }} - /> -
+ ]} + {...props} + /> ) }; + export default AppObservableMysql; \ No newline at end of file diff --git a/sysom_web/src/pages/app_observable/nginx/index.js b/sysom_web/src/pages/app_observable/nginx/index.js new file mode 100644 index 0000000000000000000000000000000000000000..1115344c015f273a8c07b6d9209744493934b129 --- /dev/null +++ b/sysom_web/src/pages/app_observable/nginx/index.js @@ -0,0 +1,34 @@ +import MultiGrafanaPannel from '../components/multiGrafanaPannel'; +import { useIntl } from 'umi'; + +/** + * Nginx应用观测 + * @returns + */ +const AppObservableNginx = (props) => { + const intl = useIntl(); + return ( + + ) +}; +export default AppObservableNginx; \ No newline at end of file diff --git a/sysom_web/src/pages/diagnose/components/Dashboard.jsx b/sysom_web/src/pages/diagnose/components/Dashboard.jsx index aaf7fdbb6b619b3f6c8bad397959789a050c8f35..e719cbff066f1b4fa8be69e263dfa52647cb87e8 100644 --- a/sysom_web/src/pages/diagnose/components/Dashboard.jsx +++ b/sysom_web/src/pages/diagnose/components/Dashboard.jsx @@ -19,7 +19,7 @@ const templateReplace = (template, Vars = []) => { }; //parentData is for create local popup pannel by parent pannel -const createPannel = (pannel, datas, globalVariables, showModalPannel, parentData = {},) => { +const createPannel = (pannel, datas, globalVariables, refreshTask, showModalPannel, parentData = {}) => { const pannelMap = { stat: StatisticPannel, row: RowPannel, @@ -51,6 +51,7 @@ const createPannel = (pannel, datas, globalVariables, showModalPannel, parentDat datas={datas} globalVariables={globalVariables} showModalPannel={showModalPannel} + refreshTask={refreshTask} /> ) } @@ -69,7 +70,7 @@ const RowPannel = (props) => { { configs.children.map(pannel => ( - {createPannel(pannel, datas, globalVariables)} + {createPannel(pannel, datas, globalVariables, props.refreshTask)} ))} @@ -139,8 +140,7 @@ const Dashboard = (props) => { const [pannelModal, setPannelModal] = useState({ visible: false }); //parentData is for create local popup pannel by parent pannel const showModalPannel = (pannel, parentData = {}) => { - console.log("showModalPannel", parentData) - const PopupModalPannel = createPannel(pannel, datas, globalVariables, showModalPannel, parentData) + const PopupModalPannel = createPannel(pannel, datas, globalVariables, props.refreshTask, showModalPannel, parentData) setPannelModal({ visible: true, pannel: PopupModalPannel }); } const handleOk = () => { @@ -157,7 +157,7 @@ const Dashboard = (props) => { { pannels.map(pannel => { const datasource = templateReplace(pannel.datasource, globalVariables) - return createPannel(pannel, datas, globalVariables, showModalPannel) + return createPannel(pannel, datas, globalVariables, props.refreshTask, showModalPannel) }) } diff --git a/sysom_web/src/pages/diagnose/components/OfflineImportModal.jsx b/sysom_web/src/pages/diagnose/components/OfflineImportModal.jsx index 51495a6a57f8b8a2571bacfdc84a2fb54b377870..d85190b43516c96207fc3f31c418dc53e942e7d2 100644 --- a/sysom_web/src/pages/diagnose/components/OfflineImportModal.jsx +++ b/sysom_web/src/pages/diagnose/components/OfflineImportModal.jsx @@ -1,4 +1,4 @@ -import { ModalForm, ProFormText, ProFormTextArea, ProFormSelect } from '@ant-design/pro-form'; +import { ModalForm, ProFormTextArea, ProFormDigit, ProFormText, ProFormSelect } from '@ant-design/pro-form'; import { forwardRef } from 'react'; import * as PropTypes from 'prop-types'; import { useIntl, FormattedMessage } from 'umi'; @@ -18,7 +18,10 @@ let OfflineImportModal = (props, ref) => { visible, modalWidth, onVisibleChange, - onFinish + onFinish, + taskForm, + serviceName, + queryParams, } = props; const intl = useIntl(); @@ -46,29 +49,50 @@ let OfflineImportModal = (props, ref) => { }) }} > +