diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..cd1b45813cf1fd9a10d035f99a4418856769a60d Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..638c00f5c90f638d912f12138c606a38fe3adbbc --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Python 编译文件 +__pycache__/ + +# 虚拟环境 +venv/ +.env/ +.venv/ +env/ + +# 包安装目录 +*.egg-info/ +*.egg +dist/ +build/ + +# 缓存 +.pytest_cache/ + +# IDE 配置文件 +.vscode/ +.idea/ +*.swp +*.swo + +# Jupyter Notebook +.ipynb_checkpoints/ + +# 本地配置文件(如数据库、密钥) +.env.local diff --git a/README.md b/README.md index 1221e31ef4dffe3cff740e5ca695677d21af7a35..d87a19c3f8e100ed7282ae2cc752eaa9a8988580 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ ``` + # Notice - The OpenCloudOS-tools project is currently under refactoring, and upon completion, it will be reopened as an open-source project. Stay tuned for updates. + # OpenCloudOS Toolkit - OpenCloudOS-tools is a useful toolkit, which allows: Querying and changing settings for OpenCloudOS, such as system information; upgrade system. diff --git a/completions/oc b/completions/oc new file mode 120000 index 0000000000000000000000000000000000000000..703f9920916184705907f5530a708d07cc2f22f9 --- /dev/null +++ b/completions/oc @@ -0,0 +1 @@ +ocos \ No newline at end of file diff --git a/completions/oc-ops b/completions/oc-ops new file mode 100644 index 0000000000000000000000000000000000000000..048da5ff9456ef6339698d9027d46f248b6f880f --- /dev/null +++ b/completions/oc-ops @@ -0,0 +1,33 @@ +#!/bin/bash + +_ocops_completions() +{ + local result + local n=${#COMP_WORDS[@]} + n=$((n-1)) + + local cmd=(ocos ops "${COMP_WORDS[@]}") + cmd[2]="--bashcomp" + unset cmd[$((n+3))] + result="$(${cmd[@]} /dev/null)" + local lead="${result%% *}" + if [ -z "${lead##*:*}" -o -z "${result##*$'\n' *}" -o -z "${result##*$'\t'*}" ]; then + result= + fi + + if [ -n ".$result" ]; then + local IFS=$'\n' + local sug=($(compgen -W "$result" -- ${COMP_WORDS[$n]})) + if [ "${#sug[@]}" == "1" ]; then + local number="${sug[0]/%\ */}" + COMPREPLY=("$number") + else + for i in "${!sug[@]}"; do + sug[$i]="$(printf '%*s' "-$COLUMNS" "${sug[$i]}")" + done + + COMPREPLY=("${sug[@]}") + fi + fi +} +complete -F _ocops_completions oc-ops diff --git a/completions/ocos b/completions/ocos new file mode 100644 index 0000000000000000000000000000000000000000..1155c0bb7cc6daac5567028522f5899f557179d7 --- /dev/null +++ b/completions/ocos @@ -0,0 +1,59 @@ +#!/bin/bash + +_ocos_completions() +{ + local result + local n=${#COMP_WORDS[@]} + n=$((n-1)) + + if [ "$n" == 1 ]; then + result="$(&2 + echo "$usage" + exit 1;; + esac + shift +done + + diff --git a/ocos-comp.txt b/ocos-comp.txt new file mode 100644 index 0000000000000000000000000000000000000000..303c28f214924bbdc285ee9b699263adb458b4aa --- /dev/null +++ b/ocos-comp.txt @@ -0,0 +1,12 @@ +update [rpm_name] Update the system +install rpm_name install rpms +show Show the system version +check [rpm_name] Check the modified rpms +analyze Analyze the system performance +check-update Check available package updates +backup [reboot] Backup the system online, or reboot to backup +reinstall Recover or Reinstall the system +recover Recover or Reinstall the system +ops Operation tools +help Show this usage +version Show the script version diff --git a/ops/VERSION b/ops/VERSION new file mode 100644 index 0000000000000000000000000000000000000000..a50236ab50ffdc61dc704e9885860b8341b74402 
--- /dev/null +++ b/ops/VERSION @@ -0,0 +1 @@ +4.2.12 diff --git a/ops/cpu/irq/latency/irq_latency.sh b/ops/cpu/irq/latency/irq_latency.sh new file mode 100755 index 0000000000000000000000000000000000000000..43b777fd77c7aa1491bc216aa6c65b0ba1914930 --- /dev/null +++ b/ops/cpu/irq/latency/irq_latency.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +usage="\ +Usage: + oc-ops cpu irq latency [-e val] [-f freq_ms] [-t threshold_ms] [-c] [-k] [-h] + COMMAND-LINE Options: + -e 使能/去使能irq latency的检测,-e参数后面只能传两种值之一: 1 或者 0,1表示开启监控,0表示关闭监控 + -f 设置监控时采样的频率所对应的时间,单位为ms,取值范围为[4ms, 1000ms] (默认值为10ms) + -t 设置latency的阈值,单位为ms,取值范围为[freq_ms, 30000ms] (默认值为30ms);中断响应latency超过阈值时相关的栈信息将打印出来; + 上限设置的原因:如果关中断的时延超过30000ms,soft lockup特性能检测出来 + -c clear已积累的栈信息 + -k 为避免忘记关闭检测而对系统造成污染,oc-ops cpu irq latency将默认在3600s后关闭检测;-k表示 keep irq latency的检测,永远不会自动关闭 + -h 显示 oc-ops cpu irq latency 的用法 + +其他信息: + 使能后,每60s自动打印检测的结果;如想主动看检测结果,可执行下面两条命令主动获取: + cat /proc/irq_latency/trace_dist + cat /proc/irq_latency/trace_stack +" +if (( $# < 1 )); then + echo "$usage" + exit 1 +fi + +lsmod | grep irqlatency > /dev/null +if (( $? != 0 )); then + modprobe irqlatency 1>/dev/null 2>&1 + if (( $? != 0 )); then + echo "Error: modprobe irqlatency fail! We add the feature recently, so, please use a newer kernel verison." + arch_val=$( uname -a | grep "x86" | wc -l ) + if (( $arch_val == 0 )); then + echo "Note: We only support irqlatency on x86/x86_64 architecture." + fi + exit 1 + fi +fi + +workdir=$(readlink /proc/$$/fd/255); workdir=$(dirname $workdir); cd "$workdir" + +irq_latency_enable=0 +freq_ms=10 +threshold_ms=30 +keep_monitor=0 +exit_monitor=3600 + +clear_stack_info() +{ + echo "0" > /proc/irq_latency/trace_stack +} + +disable_irq_latency() +{ + echo "0" > /proc/irq_latency/enable + clear_stack_info + killall irq_latency_show.sh 2>/dev/null + exit 0 +} + +enable_val_set() +{ + if (( $irq_latency_enable != 0 && $irq_latency_enable != 1 )); then + echo "Error: -e val is $irq_latency_enable, which must be 0 or 1 !" ; exit 1 + fi + + if (( $irq_latency_enable == 0 )); then + disable_irq_latency + else + echo "1" > /proc/irq_latency/enable + ./irq_latency_show.sh & + fi +} + +freq_ms_set() +{ + if (( $freq_ms < 4 || $freq_ms > 1000 )); then + echo "Error: -f freq_ms is $freq_ms, which should be in [4ms, 1000ms] !" ; exit 1 + fi + + echo "$freq_ms" > /proc/irq_latency/freq_ms +} + +threshold_ms_set() +{ + if (( $threshold_ms < $freq_ms || $threshold_ms > 30000 )); then + echo "Error: -t threshold_ms is $threshold_ms, which should be in [freq_ms, 1000ms] !" ; exit 1 + fi + + echo "$threshold_ms" > /proc/irq_latency/latency_thresh_ms +} + +while getopts 'e:f:t:q:ckh' OPT; do + case $OPT in + e) irq_latency_enable="$OPTARG" + enable_val_set + ;; + f) freq_ms_set="$OPTARG" + freq_ms_set + ;; + t) threshold_ms_set="$OPTARG" + threshold_ms_set + ;; + c) clear_stack_info + ;; + q) exit_monitor="$OPTARG" + ;; + k) keep_monitor=1 + ;; + h) echo "$usage" + exit $? + ;; + ?) 
echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac +done + +if (( $irq_latency_enable == 1 && $keep_monitor == 0 )); then + $( sleep $exit_monitor; disable_irq_latency ) & +fi diff --git a/ops/cpu/irq/latency/irq_latency_show.sh b/ops/cpu/irq/latency/irq_latency_show.sh new file mode 100755 index 0000000000000000000000000000000000000000..7da7cc8a8c68dcef083928e5112c1fea9f7fcf02 --- /dev/null +++ b/ops/cpu/irq/latency/irq_latency_show.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +while true +do + sleep 60 + echo "" + cat /proc/irq_latency/trace_dist + cat /proc/irq_latency/trace_stack +done diff --git a/ops/cpu/irq/latency/ops-help b/ops/cpu/irq/latency/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..2c405e7249d3954f2ec02996917dc89c12a62fb2 --- /dev/null +++ b/ops/cpu/irq/latency/ops-help @@ -0,0 +1 @@ +IRQ latency diff --git a/ops/cpu/irq/latency/ops-run b/ops/cpu/irq/latency/ops-run new file mode 120000 index 0000000000000000000000000000000000000000..9704b64d5924968636e81f36da83c3528a55f14f --- /dev/null +++ b/ops/cpu/irq/latency/ops-run @@ -0,0 +1 @@ +irq_latency.sh \ No newline at end of file diff --git a/ops/cpu/irq/ops-help b/ops/cpu/irq/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..3051fc176a9abca61f977dc5b2e9330b9a6de23f --- /dev/null +++ b/ops/cpu/irq/ops-help @@ -0,0 +1 @@ +IRQ relative tools diff --git a/ops/cpu/irq/stat/irq_stat.py b/ops/cpu/irq/stat/irq_stat.py new file mode 100755 index 0000000000000000000000000000000000000000..b3fefcecd3c31ffa8d1a2fcd3045458da401a7c1 --- /dev/null +++ b/ops/cpu/irq/stat/irq_stat.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +import argparse +import time + +def parse_args(): + parser = argparse.ArgumentParser(description='中断统计工具') + parser.add_argument('-c', '--cpu', type=int, default=-1, + help='指定要查看的CPU') + parser.add_argument('-b', '--bindings', action='store_true', + help='输出指定CPU的中断绑定信息') + parser.add_argument('-t', '--top', type=int, default=0, + help='查看一定时间内产生最多的中断') + parser.add_argument('-i', '--interval', type=float, default=1.0, + help='指定时间间隔') + return parser.parse_args() + +def print_bindings(cpu): + with open('/proc/interrupts') as f: + for line in f: + fields = line.split() + if len(fields) > 0 and fields[0].endswith('-%d' % cpu): + print(line.strip()) + +def print_top_interrupts(top, interval): + interrupts1 = read_interrupts() + time.sleep(interval) + interrupts2 = read_interrupts() + diffs = [(k, interrupts2[k] - interrupts1.get(k, 0)) for k in interrupts2] + diffs.sort(key=lambda x: x[1], reverse=True) + for i in range(min(top, len(diffs))): + print('%-4s %10d' % (diffs[i][0], diffs[i][1])) + +def read_interrupts(): + interrupts = {} + with open('/proc/interrupts') as f: + for line in f: + fields = line.split() + if len(fields) > 0 and fields[0].endswith(':'): + interrupts[fields[0][:-1]] = sum([int(x) if x.isdigit() else 0 for x in fields[1:]]) + return interrupts + +def main(): + args = parse_args() + if args.cpu >= 0 and args.bindings: + print_bindings(args.cpu) + elif args.top > 0: + while True: + print_top_interrupts(args.top, args.interval) + else: + interrupts = read_interrupts() + for irq in interrupts: + print('%-4s %10d' % (irq, interrupts[irq])) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/ops/cpu/irq/stat/ops-help b/ops/cpu/irq/stat/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b921bcc6d0294784582e2616b3359440272c3313 --- /dev/null +++ b/ops/cpu/irq/stat/ops-help @@ -0,0 +1 @@ 
+IRQ statistics diff --git a/ops/cpu/irq/stat/ops-run b/ops/cpu/irq/stat/ops-run new file mode 120000 index 0000000000000000000000000000000000000000..4b0749dc25911c46f93c82170a65fc88a3c60b0c --- /dev/null +++ b/ops/cpu/irq/stat/ops-run @@ -0,0 +1 @@ +irq_stat.py \ No newline at end of file diff --git a/ops/cpu/ops-help b/ops/cpu/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..4f01aafa4c85f2b82fba921c2de1fa2e0a0da7a9 --- /dev/null +++ b/ops/cpu/ops-help @@ -0,0 +1 @@ +CPU relative tools diff --git a/ops/cpu/runqlat/ops-help b/ops/cpu/runqlat/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..a7140005e66458814011bf7c4e8b3870aba84e83 --- /dev/null +++ b/ops/cpu/runqlat/ops-help @@ -0,0 +1 @@ +eBPF: summarize run queue (scheduler) latency as a histogram, see helps: t-ops fs runqlat -h; driverd by eBPF diff --git a/ops/cpu/runqlat/ops-run b/ops/cpu/runqlat/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..2164ef68663a54e6144738c6c3eaadd53cfbded0 --- /dev/null +++ b/ops/cpu/runqlat/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function main() +{ + if [ ! -d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + + printf "${GREEN}if need, see helps:t-ops cpu runqlat -h${NC}\n" + /usr/share/bcc/tools/runqlat $@ +} + +main $* diff --git a/ops/cpu/runqlen/ops-help b/ops/cpu/runqlen/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..133bdf9b372b4e889880f3a041afa3211c9f0ac1 --- /dev/null +++ b/ops/cpu/runqlen/ops-help @@ -0,0 +1 @@ +eBPF: summarize scheduler run queue length as a histogram, see helps: t-ops fs runqlen -h; driverd by eBPF diff --git a/ops/cpu/runqlen/ops-run b/ops/cpu/runqlen/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..3aa1ce32d8104ce765fbbe44fb39cab7679f253a --- /dev/null +++ b/ops/cpu/runqlen/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function main() +{ + if [ ! 
-d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + + printf "${GREEN}if need, see helps:t-ops fs runqlen -h${NC}\n" + /usr/share/bcc/tools/runqlen $@ +} + +main $* diff --git a/ops/cpu/runqslower/ops-help b/ops/cpu/runqslower/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..0d952e22aee5020d230891ecb7b054d8f424a5a6 --- /dev/null +++ b/ops/cpu/runqslower/ops-help @@ -0,0 +1 @@ +eBPF: trace high run queue latency, see helps: t-ops fs runqslower -h; driverd by eBPF diff --git a/ops/cpu/runqslower/ops-run b/ops/cpu/runqslower/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..2c43f7b510bdac8993ea059d8df5bbba22679ef9 --- /dev/null +++ b/ops/cpu/runqslower/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function main() +{ + if [ ! -d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + + printf "${GREEN}if need, see helps:t-ops fs runqslower -h${NC}\n" + /usr/share/bcc/tools/runqslower $@ +} + +main $* diff --git a/ops/cpu/sysmonitor/Readme.md b/ops/cpu/sysmonitor/Readme.md new file mode 100644 index 0000000000000000000000000000000000000000..1a031bf4c3c3725c47d5fc091fee187c848b42e3 --- /dev/null +++ b/ops/cpu/sysmonitor/Readme.md @@ -0,0 +1,18 @@ +# compiler +gcc +# perf configuration +``` +su root +echo -1 > /proc/sys/kernel/perf_event_paranoid +echo 0 > /proc/sys/kernel/kptr_restrict +exit +``` +# usage +``` +sysmonitor [-m maxsys] [-c cpu] [-i interval] [-f outfile] [-l lasttime] + -m, 设置要监控的sys值,大于这个值运行perf进行数据采集,默认20. + -c, 设置要监控的cpu,默认是监控整体cpu的sys util. + -i, 监控sys时,每次扫描的间隔时长,单位秒. + -f,输出信息的存放文件,默认是./perf.data. + -l, perf数据采集时长\n" ,cmd); +``` diff --git a/ops/cpu/sysmonitor/ops-help b/ops/cpu/sysmonitor/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..d12f4b25c8ca02294df9443526f916c5666231af --- /dev/null +++ b/ops/cpu/sysmonitor/ops-help @@ -0,0 +1 @@ +sysmonitor relative tools diff --git a/ops/cpu/sysmonitor/ops-run b/ops/cpu/sysmonitor/ops-run new file mode 120000 index 0000000000000000000000000000000000000000..87d387a70f56fec3943321962f844510e26dcf19 --- /dev/null +++ b/ops/cpu/sysmonitor/ops-run @@ -0,0 +1 @@ +sysmonitor.py \ No newline at end of file diff --git a/ops/cpu/sysmonitor/sysmonitor.py b/ops/cpu/sysmonitor/sysmonitor.py new file mode 100755 index 0000000000000000000000000000000000000000..c349b014b9a6de67409ab40089c952564c392135 --- /dev/null +++ b/ops/cpu/sysmonitor/sysmonitor.py @@ -0,0 +1,157 @@ +#! 
/usr/bin/python3 + +import sys +import os +import signal +from time import sleep + +class cpu_info: + def __init__(self): + self.utime=0; + self.ntime=0; + self.stime=0; + self.itime=0; + self.iowtime=0; + self.irqtime=0; + self.sirqtime=0; + def clone(self,cpu): + self.utime=cpu.utime; + self.ntime=cpu.ntime; + self.stime=cpu.stime; + self.itime=cpu.itime; + self.iowtime=cpu.iowtime; + self.irqtime=cpu.irqtime; + self.sirqtime=cpu.sirqtime; + + +sysCpuMax=20.0; #20% +cpuNum=0; #all cpu +flushTime=1; #1s +filePath="./perf.data"; #./perf.data +lastTime=10; #10s + +old_cpu=cpu_info() +new_cpu=cpu_info() +usrCpu=0.0 +sysCpu=0.0 +iowCpu=0.0 +irqCpu=0.0 + +def usage(cmd): + print(f"Usage: {cmd} [-m maxsys] [-c cpu] [-i interval] [-f outfile] [-l lasttime]\n \ + -m, 设置要监控的sys值,大于这个值运行perf进行数据采集,默认20.\n \ + -c, 设置要监控的cpu,默认是监控整体cpu的sys util.\n \ + -i, 监控sys时,每次扫描的间隔时长,单位秒.\n \ + -f,输出信息的存放文件,默认是./perf.data.\n \ + -l, perf数据采集时长\n") + +def handler(signum, frame): + exit(0) + +def read_cpu(cpuNum): + global old_cpu,new_cpu + old_cpu.clone(new_cpu) + with open("/proc/stat", "r",encoding="utf-8") as file: + for i in range (0,cpuNum+1): + line=file.readline() + file.close() + res=line.replace(' ',' ').split(' ') + new_cpu.utime=int(res[1],10) + new_cpu.ntime=int(res[2],10) + new_cpu.stime=int(res[3],10) + new_cpu.itime=int(res[4],10) + new_cpu.iowtime=int(res[5],10) + new_cpu.irqtime=int(res[6],10) + new_cpu.sirqtime=int(res[7],10) + +def cal_cpu(): + total_delta_time = (new_cpu.utime + new_cpu.ntime + new_cpu.stime + new_cpu.itime \ + + new_cpu.iowtime + new_cpu.irqtime + new_cpu.sirqtime) \ + - (old_cpu.utime + old_cpu.ntime + old_cpu.stime + old_cpu.itime \ + + old_cpu.iowtime + old_cpu.irqtime + old_cpu.sirqtime) + global usrCpu,sysCpu,iowCpu,irqCpu + usrCpu=(((new_cpu.utime + new_cpu.ntime) - (old_cpu.utime + old_cpu.ntime)) * 100 / total_delta_time) + sysCpu=(((new_cpu.stime ) - (old_cpu.stime)) * 100 / total_delta_time); + iowCpu=(((new_cpu.iowtime) - (old_cpu.iowtime)) * 100 / total_delta_time) + irqCpu=(((new_cpu.irqtime + new_cpu.sirqtime) - (old_cpu.irqtime + old_cpu.sirqtime)) * 100 / total_delta_time) + + +if __name__=="__main__": + + argc=len(sys.argv) + cpu=os.cpu_count() + signal.signal(signal.SIGINT, handler) + + i=1 + while(i= argc): + print("Option -m expects an argument.") + usage(sys.argv[0]) + exit(-1) + i=i+1 + sysCpuMax = float(sys.argv[i]) + i=i+1 + continue + if (sys.argv[i]=="-c"): + if (i + 1 >= argc): + print("Option -d expects an argument.") + usage(sys.argv[0]) + exit(-1) + i=i+1 + cpuNum = int(sys.argv[i],10) + if(cpuNum>cpu): + print("cpu number excess {cpu}.") + usage(sys.argv[0]) + exit(-1) + i=i+1 + continue + if (sys.argv[i]=="-i"): + if (i + 1 >= argc): + print("Option -s expects an argument.") + usage(sys.argv[0]) + exit(-1) + i=i+1 + flushTime = int(sys.argv[i],10) + i=i+1 + continue + if (sys.argv[i]=="-f"): + if (i + 1 >= argc): + print("Option -f expects an argument.") + usage(sys.argv[0]) + exit(-1) + + i+=1 + filePath=sys.argv[i] + if not os.access(os.path.dirname(filePath),os.W_OK): + print(f"wrong filePath {filepath}.") + usage(sys.argv[0]) + exit(-1) + i=i+1 + continue + if (sys.argv[i]=="-l"): + if (i + 1 >= argc): + print("Option -l expects an argument.") + usage(sys.argv[0]) + exit(-1) + i=i+1 + lastTime=int(sys.argv[i],10) + i=i+1 + continue; + if (sys.argv[i]=="-h"): + usage(sys.argv[0]) + exit(-1) + print(f"Invalid argument {sys.argv[i]}.") + usage(sys.argv[0]) + exit(-1) + + read_cpu(cpuNum) + while(True): + sleep(flushTime) + read_cpu(cpuNum) + 
cal_cpu() + print("usrCpu {:.2f}% sysCpu {:.2f}% iowCpu {:.2f}% irqCpu {:.2f}%".format(usrCpu,sysCpu,iowCpu,irqCpu)) + if(sysCpu>=sysCpuMax): + line=f"perf record -agq -o {filePath} -- sleep {lastTime}" + os.system(line) + break diff --git a/ops/fs/LICENSE b/ops/fs/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 --- /dev/null +++ b/ops/fs/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. 
+ + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/ops/fs/README.md b/ops/fs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7b104d10b415514e595d4d537f2419d4fd359c16 --- /dev/null +++ b/ops/fs/README.md @@ -0,0 +1,31 @@ +# OpenCloudOS运维工具SIG +## 职责范围 Scope +OpenCloudOS作为一个操作系统项目,离不开运维工具的建设。本小组的目标致力于,将腾讯云的运维经验中积累的有较广泛价值工具,以开源的形式贡献给社区。主要工作包括对现有系统开源工具及命令的能力补充和新工具能力的开发和运营。 + +## 成员 +* 陈立东 +* 任益 +* 王子勇 +* 邹立巍 +* 潘睿 +* 朱小磊 + +## 会议制度 Meetings +双周例会 +## 联络方式 Contacts +● 邮件列表 SIG-ops@lists.opencloudos.org +## 如何加入基础设施SIG并参与贡献 + 1.注册GitHub账号 + 2.签署CLA + 3.找到对应安全SIG项目仓库地址: +## 所辖项目 Subprojects +1. ext4文件系统数据误删除恢复工具 +【简介】ext4文件系统的数据恢复工具。 + +2. xfs文件系统数据误删除恢复工具 +【简介】xfs文件系统的数据恢复工具。 + +3. 
System toolbox
+【Description】Common system tools.
+
+
diff --git a/ops/fs/dirtop/ops-help b/ops/fs/dirtop/ops-help
new file mode 100644
index 0000000000000000000000000000000000000000..a71fcb44657db1def349c15edbaaf92c13ba4cb0
--- /dev/null
+++ b/ops/fs/dirtop/ops-help
@@ -0,0 +1 @@
+eBPF: show read/write counts and bytes for one directory; for help, see: oc-ops fs dirtop -h; driven by eBPF
diff --git a/ops/fs/dirtop/ops-run b/ops/fs/dirtop/ops-run
new file mode 100755
index 0000000000000000000000000000000000000000..1335173571438ab82d0fdd431f5ea43f95b0c27b
--- /dev/null
+++ b/ops/fs/dirtop/ops-run
@@ -0,0 +1,22 @@
+#!/bin/bash
+############################################### # File Name : ops-run
+# Version : V1.0
+# Author : aurelianliu@tencent.com
+# Organization : NULL
+###############################################
+GREEN='\033[1;32m'
+RED='\033[1;31m'
+BLUE='\033[1;34m'
+NC='\033[0m'
+
+function main()
+{
+    if [ ! -d "/usr/share/bcc/tools" ]; then
+        sudo yum install bcc-tools
+    fi
+
+    printf "${GREEN}if needed, see help: oc-ops fs dirtop -h${NC}\n"
+    /usr/share/bcc/tools/dirtop $@
+}
+
+main $*
diff --git a/ops/fs/ext4_recover/CMakeLists.txt b/ops/fs/ext4_recover/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b382b0f01d6eba1312a331164046a4ea6df75681
--- /dev/null
+++ b/ops/fs/ext4_recover/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required(VERSION 2.8)
+project(ext4_recover)
+
+add_executable(ext4recover src/ext4recover.c)
+
+find_library(EXT2FS ext2fs)
+find_library(COM_ERR com_err)
+target_link_libraries(ext4recover ${EXT2FS})
+target_link_libraries(ext4recover ${COM_ERR})
\ No newline at end of file
diff --git a/ops/fs/ext4_recover/Readme.md b/ops/fs/ext4_recover/Readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c01aa5cb67994d0715a848da8c28c0e067e14ad
--- /dev/null
+++ b/ops/fs/ext4_recover/Readme.md
@@ -0,0 +1,52 @@
+# Project Introduction
+A tool for recovering files deleted by mistake from an ext4 filesystem.
+
+## Supported Platforms
+
+Linux ext4
+
+# Getting Started
+
+## Requirements
+Linux
+
+## Prerequisites
+
+
+## Steps
+### Usage
+```
+ext4recover /dev/sdx
+# Assuming /dev/sdx is the device whose data should be recovered, the command above restores every recoverable deleted file into the RECOVER subdirectory.
+```
+### Building
+```
+yum install -y e2fsprogs-devel libcom_err-devel
+cd src
+make
+```
+
+## FAQ
+#### How it works:
+When ext4 deletes a file whose extent tree has more than one level, it calls ext4_ext_rm_idx to remove the lower level; the upper level only has its eh_entries count decremented,
+its block pointers are not modified, and the extent blocks outside the inode are cleared in memory but never written back to disk, so a file with a multi-level extent tree can be recovered from this leftover information.
+A file with only a single-level extent tree goes through ext4_ext_rm_leaf alone, which updates eh_entries of the corresponding index and zeroes the block pointer and length of every entry,
+so files with a single-level extent tree cannot be recovered.
+
+If a file is not fragmented, any file larger than 496M uses a multi-level extent tree and can be recovered.
+With fragmentation, even a small file may occupy more than 4 extents, so such highly fragmented small files can be recovered as well.
+
+Background: the fact that the external extent data removed by ext4_ext_rm_idx never reaches disk is arguably a bug that the upstream kernel community has not fixed yet;
+brookxu submitted a patch: https://lkml.org/lkml/2020/3/5/1248
+
+The original version of this tool was written by zorrozou; curuwang extracted it from the e2fsprogs code base and added the explanation of how it works.
+
+Why call it a bug? Because the kernel code does clear the leaf extents (
+after rm, reading the underlying extent blocks with dd shows they are zeroed, but dd with iflag=direct, which bypasses the page cache and reads the block device directly, shows the data is still there,
+proving the pages were only modified in memory and never marked dirty, so they were never flushed to disk).
+
+After a file is deleted, the following information in its inode changes:
+1. The file size and link count become 0
+2. The file's dtime, ctime, atime and mtime are set to the deletion time
+3. The extent changes described above
+4. Others...
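The dd observation in the "Why call it a bug?" paragraph above can be reproduced by hand. The following is a minimal sketch, not part of the shipped tool; /dev/sdx and the block number 123456 are placeholders that must be replaced with the device the deleted file lived on and the physical block number of one of its former extent blocks (for example, noted from `filefrag -v` or debugfs before the deletion):

```bash
#!/bin/bash
# Reproduce the page-cache vs. on-disk check described in the Readme above.
# DEV and BLK are placeholders supplied by the user.
DEV=/dev/sdx
BLK=123456

# Buffered read through the page cache: the extent block appears zeroed,
# because the kernel cleared the in-memory buffer when the file was removed.
dd if="$DEV" bs=4096 skip="$BLK" count=1 status=none | hexdump -C | head -n 4

# O_DIRECT read bypassing the page cache: the old extent data is still on
# disk, because the zeroed buffer was never marked dirty and written back.
dd if="$DEV" bs=4096 skip="$BLK" count=1 iflag=direct status=none | hexdump -C | head -n 4
```

If the two dumps differ as described, the deleted file's extent metadata is still on disk and ext4recover has something to work with.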
diff --git a/ops/fs/ext4_recover/ops-help b/ops/fs/ext4_recover/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..bc2ad7e87dcb21a5fb2bc83afec9e1adc5c1652b --- /dev/null +++ b/ops/fs/ext4_recover/ops-help @@ -0,0 +1 @@ +ext4 recover tool: oc-ops fs ext4_recover -h diff --git a/ops/fs/ext4_recover/ops-run b/ops/fs/ext4_recover/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..f5f81f264c4d315b5301fee01399d62c2dd006e8 --- /dev/null +++ b/ops/fs/ext4_recover/ops-run @@ -0,0 +1,63 @@ +#!/bin/bash +############################################### +# File Name :ops-run +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +drop_caches=0 +mem_cost_topn=3 + +usage="\ +Usage: + oc-ops fs ext4_recover [-d dev] [-h] + COMMAND-LINE Options: + -d block dev to recover + -h help: recover ext4 fs +" + +function strstr() +{ + echo $1 + echo $2 + echo $1 | grep $2 +} + +function main() +{ + dir=$(dirname $0) + while getopts 'd;h' OPT; do + case $OPT in + d) dev="$OPTARG" + ;; + h) echo "$usage" + cat $dir/Readme.md + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done + sub=$(strstr "rpm -qa" "e2fsprogs-devel") + if [ -z "$sub" ]; then + yum install -y e2fsprogs-devel + fi + sub=$(strstr "rpm -qa" "libcom_err-devel") + if [ -z "$sub" ]; then + yum install -y libcom_err-devel + fi + cd $dir/src/ + make + $dir/src/ext4recover $dev + cd - + + } + + main $* diff --git a/ops/fs/ext4_recover/src/Makefile b/ops/fs/ext4_recover/src/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..cbdca72996a2b95f96aac4b414a494c881254aba --- /dev/null +++ b/ops/fs/ext4_recover/src/Makefile @@ -0,0 +1,10 @@ +all: ext4recover bgrep_demo + +ext4recover: ext4recover.c + $(CC) -Werror -o $@ $< -lcom_err -lext2fs + +bgrep_%: bgrep_%.c + $(CC) -Werror -o $@ $< + +clean: + rm -f ext4recover bgrep_demo bgrep_non_zero diff --git a/ops/fs/ext4_recover/src/bgrep.c b/ops/fs/ext4_recover/src/bgrep.c new file mode 100644 index 0000000000000000000000000000000000000000..3d70d3581460aa37805bfdacf288e7de2c9de214 --- /dev/null +++ b/ops/fs/ext4_recover/src/bgrep.c @@ -0,0 +1,44 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#define BUFLEN 4096 +int main(int argc, char **argv){ + int needle_len; + char *needle; + char buf[BUFLEN]; + char * f; + + if(argc < 4){ + printf("Usage: %s file_path bytes_len bytes_str\n", argv[0]); + exit(1); + } + setlinebuf(stdout); + + f = argv[1]; + needle_len = atoi(argv[2]); + needle = argv[3]; + + int fd = open(f, O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + printf("checkint match bytes from file %s\n", f); + + + size_t nread, blk_num=0; + ssize_t blk_start= -1; + char *p; + while (nread = read(fd, buf, BUFLEN)) { + if( (p = memmem(buf, nread, needle, needle_len)) != NULL){ + printf("found math at block %lu offset: %lu\n", blk_num, p-buf); + } + blk_num += 1; + } +} diff --git a/ops/fs/ext4_recover/src/bgrep_demo.c b/ops/fs/ext4_recover/src/bgrep_demo.c new file mode 100644 index 0000000000000000000000000000000000000000..aee4ae64dd7b2228ed9b6e7af34535ef442832bb --- /dev/null +++ b/ops/fs/ext4_recover/src/bgrep_demo.c @@ -0,0 +1,114 @@ +/* + * Copyright 2020 Tencent Inc. All rights reserved. 
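+ * Scans a device image block by block for Lucene segment-file magic strings (only the .fdt stored-fields signatures are enabled below) and dumps each matching run of blocks into ./RECOVER until an end marker or MAX_VALID_BLOCKS is reached.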
+ * Author: curuwang@tencent.com +*/ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#define BLOCK_SIZE 4096 +#define MAX_VALID_BLOCKS 131072 +#define RECOVER_DIR "./RECOVER" + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s file\n", argv[0]); + exit(1); + } + printf("creating recover directory '%s'\n", RECOVER_DIR); + if (mkdir(RECOVER_DIR, 0700) < 0) { + perror("failed to create recover dir:"); + return 1; + } + // fdx + char *match_fdx_start = "\x3f\xd7\x6c\x17\x1dLucene50StoredFieldsHighIndex"; + int match_fdx_len = 34; + // fdt + char *match_fdt1_start = "\x3f\xd7\x6c\x17\x1cLucene50StoredFieldsFastData"; + int match_fdt1_len = 33; + char *match_fdt2_start = "\x3f\xd7\x6c\x17\x1cLucene50StoredFieldsHighData"; + int match_fdt2_len = 33; + + char *match_fnm_start = "\x3f\xd7\x6c\x17\x12Lucene60FieldInfos"; + int match_fnm_len = 23; + char *match_si_start = "\x3f\xd7\x6c\x17\x13Lucene70SegmentInfo"; + int match_si_len = 24; + char *match_end = "\xc0\x28\x93\xe8\x00\x00\x00\x00"; + char match_end_len = 8; + char *f = argv[1]; + printf("recovering from %s\n", f); + + int fd = open(f, O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + size_t nread, blk_number; + char buf[BLOCK_SIZE]; + char wfilename[100]; + int wfd = 0; + char *p; + char *pend; + int started = 0; + size_t nblocks = 0; + while (nread = read(fd, buf, BLOCK_SIZE)) { + p = buf; + if (!started) { + if (((p = memmem(buf, nread, match_fdt1_start, match_fdt1_len)) == buf) || + ((p = memmem(buf, nread, match_fdt2_start, match_fdt2_len)) == buf)) { + sprintf(wfilename, "%s/%lu.fdt", RECOVER_DIR, blk_number); + } + /* + else if( (p=memmem(buf, nread, match_fdx_start, match_fdx_len)) == buf){ + sprintf(wfilename, "./recover/%lu.fdx", blk_number); + }else if( (p=memmem(buf, nread, match_fnm_start, match_fnm_len)) == buf){ + sprintf(wfilename, "./recover/%lu.fnm", blk_number); + }else if( (p=memmem(buf, nread, match_si_start, match_si_len)) == buf){ + sprintf(wfilename, "./recover/%lu.si", blk_number); + } + */ + + if (p == buf) { + started = 1; + nblocks = 0; + printf("found match file begin at block:%-10lu\n", blk_number); + printf("dump file to %s... begin\n", wfilename); + fflush(stdout); + wfd = open(wfilename, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (wfd < 0) { + perror("open"); + return 1; + } + } + } + + if (started) { + pend = memmem(buf, nread, match_end, match_end_len); + // found end str + if (pend != NULL) { + pend = pend + match_end_len + 8; + write(wfd, buf, pend - buf); + printf("found match end at block:%-10lu\n", blk_number); + close(wfd); + started = 0; + } else if (nblocks > MAX_VALID_BLOCKS) { + write(wfd, buf, nread); + printf("reach MAX_VALID_BLOCKS at block:%-10lu\n", blk_number); + close(wfd); + started = 0; + } else { + write(wfd, buf, nread); + nblocks += 1; + } + } + blk_number += 1; + } + close(fd); +} + diff --git a/ops/fs/ext4_recover/src/bgrep_foundationdb.c b/ops/fs/ext4_recover/src/bgrep_foundationdb.c new file mode 100644 index 0000000000000000000000000000000000000000..157076cfde40cbc655875c4b87f8d8f27cad3c3e --- /dev/null +++ b/ops/fs/ext4_recover/src/bgrep_foundationdb.c @@ -0,0 +1,134 @@ +/* + * Copyright 2020 Tencent Inc. All rights reserved. 
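+ * Scans a device image block by block for FoundationDB data-file signatures (.sqlite, .processId, .sqlite-wal and plain "SQLite format 3" headers) and dumps each match into ./RECOVER; for SQLite files the expected size is read from the page size and page count fields in the header.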
+ * Author: curuwang@tencent.com +*/ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BLOCK_SIZE 4096 +#define MAX_VALID_BLOCKS 131072 +#define RECOVER_DIR "./RECOVER" + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s [max_file_size_in_bytes]\n", argv[0]); + exit(1); + } + + setlinebuf(stdout); + + int max_file_size = 0; + if (argc == 3 ){ + max_file_size = atoi(argv[2]); + printf("max file size: %d\n", max_file_size); + } + + printf("creating recover directory '%s'\n", RECOVER_DIR); + if (mkdir(RECOVER_DIR, 0700) < 0) { + perror("failed to create recover dir:"); + return 1; + } + // .sqlite + char *m1_start = "FoundationDB100\0"; + char *m1_suffix = ".sqlite"; + int m1_len = 16; + // processId + char *m2_start = "\x01\x00\x01\x70\xa5\x00\xdb\x0f"; + char *m2_suffix = ".processId"; + int m2_len = 8; + //.sqlite-wal + char *m3_start = "\x37\x7f\x06\x82\x00\x2d\xe2\x18\x00\x00\x10"; + char *m3_suffix = ".sqlite-wal"; + int m3_len = 11; + //.db: reeal sqlite v3 + char *m4_start = "SQLite format 3\0"; + char *m4_suffix = ".sqlite.db"; + int m4_len = 16; + + char *f = argv[1]; + printf("recovering from %s\n", f); + + int fd = open(f, O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + size_t nread, blk_number=0; + char buf[BLOCK_SIZE]; + char wfilename[100]; + int wfd = 0; + char *p; + char *pend; + int started = 0; + size_t nblocks = 0; + + uint64_t file_size = 0; + while (nread = read(fd, buf, BLOCK_SIZE)) { + if (!started) { + if ( (p = memmem(buf, nread, m1_start, m1_len)) == buf) { + sprintf(wfilename, "%s/%lu%s", RECOVER_DIR, blk_number, m1_suffix); + file_size = max_file_size; + } + else if ( (p = memmem(buf, nread, m2_start, m2_len)) == buf) { + sprintf(wfilename, "%s/%lu%s", RECOVER_DIR, blk_number, m2_suffix); + file_size = max_file_size; + } + else if ( (p = memmem(buf, nread, m3_start, m3_len)) == buf) { + sprintf(wfilename, "%s/%lu%s", RECOVER_DIR, blk_number, m3_suffix); + file_size = max_file_size; + } + else if ( (p = memmem(buf, nread, m4_start, m4_len)) == buf) { + sprintf(wfilename, "%s/%lu%s", RECOVER_DIR, blk_number, m4_suffix); + uint16_t page_size; + uint32_t page_count; + page_size = __bswap_16(*(uint16_t*)(p+16)); + page_count = __bswap_32(*(uint32_t*)(p+28)); + file_size = (uint64_t)page_count * page_size; + printf("page size %u, page count:%u, file size:%u\n", page_size, page_count, file_size); + if( max_file_size > 0 && file_size > max_file_size){ + file_size = max_file_size; + printf("truncate file to max_file_size:%d\n", max_file_size); + } + } + + if (p == buf) { + started = 1; + nblocks = 0; + printf("found match file begin at block:%-10lu\n", blk_number); + printf("dump file to %s... 
begin\n", wfilename); + fflush(stdout); + wfd = open(wfilename, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (wfd < 0) { + perror("open"); + return 1; + } + } + } + + if (started) { + if((nblocks+1)*BLOCK_SIZE >= file_size){ + size_t to_write = file_size - nblocks*BLOCK_SIZE; + write(wfd, buf, to_write); + printf("nwrite: %lu >= %lu: to_wirte:%lu\n", (nblocks+1)*BLOCK_SIZE, file_size, to_write); + printf("file end at block:%-10lu\n", blk_number); + close(wfd); + started = 0; + } else { + write(wfd, buf, nread); + nblocks += 1; + } + } + blk_number += 1; + } + close(fd); +} + diff --git a/ops/fs/ext4_recover/src/bgrep_non_zero.c b/ops/fs/ext4_recover/src/bgrep_non_zero.c new file mode 100644 index 0000000000000000000000000000000000000000..2c7e7db94f73315e0cf80a5ed2501313fdcad055 --- /dev/null +++ b/ops/fs/ext4_recover/src/bgrep_non_zero.c @@ -0,0 +1,63 @@ +/* + * Copyright 2020 Tencent Inc. All rights reserved. + * Author: curuwang@tencent.com +*/ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BLOCK_SIZE 16*1024*1024 +#define MAX_VALID_BLOCKS 131072 +#define RECOVER_DIR "./RECOVER" + +int main(int argc, char **argv) { + if (argc < 3) { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + + setlinebuf(stdout); + + char *f = argv[1]; + size_t block_size = atoi(argv[2]); + printf("checking %s with chunk: %d bytes\n", f, block_size); + + char *zero_block = malloc(block_size); + char *buf = malloc(block_size); + memset(zero_block, '\0', block_size); + + int fd = open(f, O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + size_t nread, blk_num=0; + ssize_t blk_start= -1; + while (nread = read(fd, buf, block_size)) { + if(memcmp(buf, zero_block, block_size) != 0){ + blk_start = blk_start > -1 ? 
blk_start : blk_num; + }else{ + if(blk_start > -1){ + printf("found non zero %d bytes block at: %-llu -> %-llu\n", block_size, blk_start, blk_num); + blk_start = -1; + } + } + blk_num += 1; + } + + if(blk_start > -1){ + printf("found non zero %d bytes block at: %-llu -> %-llu\n", block_size, blk_start, blk_num); + } + + close(fd); +} + diff --git a/ops/fs/ext4_recover/src/ext4recover.c b/ops/fs/ext4_recover/src/ext4recover.c new file mode 100644 index 0000000000000000000000000000000000000000..ec26b2e207bca76db9275a7029f81f1bb1f093e9 --- /dev/null +++ b/ops/fs/ext4_recover/src/ext4recover.c @@ -0,0 +1,392 @@ +/* + * History: + * 2020-03-01 - Creation by zorrozou + * 2020-03-08 - beta 0.1 + * 2020-08-05 - beta 0.2 split from e2fsprogs code by curuwang + */ + +#define _LARGEFILE_SOURCE +#define _LARGEFILE64_SOURCE +#define _FILE_OFFSET_BITS 64 + +#include +#include +#include +#include +#include +#include +#include +#include + +#define RECOVER_DIR "./RECOVER" +#define VERSION "0.2b" + +static const char *program_name = "ext4recover"; +static char *device_name = NULL; +static int flag = 0; +static __u32 icount; +static struct ext3_extent_header *eh; +static struct ext3_extent_idx *ei; +static struct ext3_extent *ee; +static ssize_t blocksize; +static int recover_fd, device_fd; + +struct extent_path { + char *buf; + int entries; + int max_entries; + int left; + int visit_num; + int flags; + blk64_t end_blk; + void *curr; +}; + +struct ext2_extent_handle { + errcode_t magic; + ext2_filsys fs; + ext2_ino_t ino; + struct ext2_inode *inode; + struct ext2_inode inodebuf; + int type; + int level; + int max_depth; + int max_paths; + struct extent_path *path; +}; + +static int is_inode_extent_clear(struct ext2_inode *inode) { + errcode_t retval; + + eh = (struct ext3_extent_header *) inode->i_block; + ei = (struct ext3_extent_idx *) eh + 1; + ee = (struct ext3_extent *) eh + 2; + blk64_t blk; + blk = ext2fs_le32_to_cpu(ei->ei_leaf) + + ((__u64) ext2fs_le16_to_cpu(ei->ei_leaf_hi) << 32); + if (ei->ei_leaf > 1 && LINUX_S_ISREG(inode->i_mode) && inode->i_links_count == 0) { + fprintf(stdout, "inode: %u extent_block: %llu\n", icount, blk); + // not clear + return 0; + } + + return 1; +} + +static int recover_block_to_file(int devfd, int inofd, __le32 block, __le16 len, __u64 start) { + off_t offset_dev, offset_ino, offset; + ssize_t size, ret; + char buf[blocksize]; + int i; + + offset_dev = lseek(devfd, start * blocksize, SEEK_SET); + if (offset_dev < 0) { + perror("lseek(devfd)"); + return 0; + } + + offset_ino = lseek(inofd, block * blocksize, SEEK_SET); + if (offset_ino < 0) { + perror("lseek(inofd)"); + return 0; + } + + for (i = 0; i < len; i++) { + offset = 0; + size = 0; + while (offset < blocksize) { + size = read(devfd, buf + offset, blocksize - offset); + if (size < 0) { + lseek(devfd, offset_dev + blocksize, SEEK_SET); + perror("read(dev_fd)"); + continue; + } + offset += size; + } + //printf("before write offset: %llu\n", offset); + //printf("before write size: %llu\n", size); + while (offset != 0) { + ret = write(inofd, buf + (blocksize - offset), offset); + if (ret < 0) { + lseek(inofd, offset_ino + blocksize, SEEK_SET); + perror("write(inofd)"); + continue; + } + offset -= ret; + //printf("ret: %llu\n", ret); + } + //printf("after write offset: %llu\n", offset); + //printf("after write size: %llu\n", size); + } + + offset_dev = lseek(devfd, 0, SEEK_SET); + if (offset_dev < 0) { + perror("lseek(devfd)"); + return 0; + } + + offset_ino = lseek(inofd, 0, SEEK_SET); + if (offset_ino < 0) { + 
perror("lseek(inofd)"); + return 0; + } + return 1; +} + +static int dump_dir_extent(struct ext3_extent_header *eh) { + struct ext3_extent *ee; + int i; + __le32 ee_block; + __le16 ee_len; + __u64 ee_start; + char *buf; + __u32 headbuflen = 4; + int retval; + +/* + retval = ext2fs_get_mem(headbuflen, &buf); + if (retval) + return; +*/ + ee = EXT_FIRST_EXTENT(eh); +//printf("eh->eh_entries: %d\n", eh->eh_entries); +//printf("eh->eh_max: %d\n", eh->eh_max); + if (ext2fs_le16_to_cpu(eh->eh_entries) > 340) { + return 1; + } else if (ext2fs_le16_to_cpu(eh->eh_magic) != EXT3_EXT_MAGIC) { + return 1; + } else if (ext2fs_le16_to_cpu(eh->eh_max) != 340) { + return 1; + } + for (i = 1; i < eh->eh_entries + 1; i++) { + ee_block = ext2fs_le32_to_cpu(ee->ee_block); + ee_len = ext2fs_le32_to_cpu(ee->ee_len); + ee_start = (((__u64) ext2fs_le16_to_cpu(ee->ee_start_hi) << 32) + + (__u64) ext2fs_le32_to_cpu(ee->ee_start)); + printf("%u %u %u %llu\n", icount, ee_block, ee_len, ee_start); + fflush(stdout); + + retval = recover_block_to_file(device_fd, recover_fd, ee_block, ee_len, ee_start); + if (!retval) { + fprintf(stderr, "recover_block_to_file()\n"); + return 0; + } + + ee++; + } + return 1; +} + +static int extent_tree_travel(ext2_extent_handle_t handle, struct ext3_extent_header *eh) { + struct ext3_extent_header *next; + struct ext3_extent_idx *ei; + int i, retval; + char *buf; + blk64_t blk; + + //printf("Enter extent_tree_travel + if (eh->eh_depth == 0) { + //printf("dump_dir_extent\n"); + retval = dump_dir_extent(eh); + if (!retval) { + fprintf(stderr, "dump_dir_extent()\n"); + return retval; + } + } else if (eh->eh_depth <= 4) { + flag = 1; + //printf("eh->eh_depth < 4\n"); + for (i = 1; i < eh->eh_entries + 1; i++) { + retval = ext2fs_get_mem(blocksize, &buf); + if (retval) + return retval; + + memset(buf, 0, blocksize); + ei = EXT_FIRST_INDEX(eh) + i - 1; + blk = ext2fs_le32_to_cpu(ei->ei_leaf) + + ((__u64) ext2fs_le16_to_cpu(ei->ei_leaf_hi) << 32); + retval = io_channel_read_blk64(handle->fs->io, + blk, 1, buf); + if (retval) + return retval; + next = (struct ext3_extent_header *) buf; + //printf("Recursive\n"); + retval = extent_tree_travel(handle, next); + if (!retval) { + fprintf(stderr, "extent_tree_travel()\n"); + } + ext2fs_free_mem(&buf); + //printf("Recursive end\n"); + } + } else { + /* xxxxxxxxxxxxxxx */ + return 1; + } + return 1; +} + +static int prase_ino_extent(ext2_extent_handle_t handle) { + int i, retval; + struct ext3_extent_idx *ix; + struct ext3_extent_header *next; + char *buf; + blk64_t blk; + struct ext3_extent_header *eh; + + eh = (struct ext3_extent_header *) handle->inode->i_block; + +// printf("aaaaaaaaaaaaaa\n"); + retval = ext2fs_get_mem(blocksize, &buf); + if (retval) + return 0; +// printf("i:%d\n", i); +// printf("ix->ei_leaf: %d\n", ix->ei_leaf); +// printf("next->eh_max: %d\n", next->eh_max); +// printf("next->eh_entries: %d\n", next->eh_entries); + memset(buf, 0, blocksize); + for (i = 1; i <= 4; i++) { + ix = EXT_FIRST_INDEX(eh) + i - 1; + + + blk = ext2fs_le32_to_cpu(ix->ei_leaf) + + ((__u64) ext2fs_le16_to_cpu(ix->ei_leaf_hi) << 32); + retval = io_channel_read_blk64(handle->fs->io, + blk, 1, buf); + if (retval) + return retval; + + next = (struct ext3_extent_header *) buf; + //printf("bbbbbbbbbbbbbb\n"); + retval = extent_tree_travel(handle, next); + if (!retval) { + fprintf(stderr, "extent_tree_travel()"); + return retval; + } + //printf("cccccccccccccccccc\n"); + /* for last extent */ + if (next->eh_entries < 340) { +// printf("dddddddddddddddddddd\n"); + 
break; + } + }; + ext2fs_free_mem(&buf); + return 1; +} + +static int is_on_device(const char *path, const char *dev) { + struct stat stat1, stat2; + int ret; + ret = stat(path, &stat1); + if (ret < 0) { + perror("stat:"); + return -1; + } + ret = stat(dev, &stat2); + if (ret < 0) { + perror("stat:"); + return -1; + } + if (((stat2.st_mode & S_IFMT) == S_IFBLK) && (stat1.st_dev == stat2.st_rdev)) { + return 1; + } + return 0; +} + +int main(int argc, char **argv) { + errcode_t retval; + blk64_t use_superblock = 0; + ext2_filsys fs; + int use_blocksize = 0; + int flags; + __u32 imax; + struct ext2_inode inode; + ext2_extent_handle_t handle; + char filename[BUFSIZ]; + + if (argc != 2) { + fprintf(stderr, "Usage: %s /dev/xxx\n", argv[0]); + fprintf(stderr, "Recover deleted files using remaining extent info\n"); + fprintf(stderr, "version: %s\n\n", VERSION); + fprintf(stderr, "eg:\n\t%s /dev/vdb1\n", argv[0]); + exit(1); + } + + device_name = argv[1]; + flags = EXT2_FLAG_JOURNAL_DEV_OK | EXT2_FLAG_SOFTSUPP_FEATURES | + EXT2_FLAG_64BITS; + retval = ext2fs_open(device_name, flags, use_superblock, + use_blocksize, unix_io_manager, &fs); + + if (retval) { + com_err(program_name, retval, "while trying to open %s", + device_name); + printf("%s", "Couldn't find valid filesystem superblock.\n"); + exit(retval);; + } + fs->default_bitmap_type = EXT2FS_BMAP64_RBTREE; + + blocksize = fs->blocksize; + + device_fd = open(argv[1], O_RDONLY); + if (device_fd < 0) { + perror("open(device)"); + exit(1); + } + + retval = mkdir(RECOVER_DIR, 0750); + if (retval < 0 && errno != EEXIST) { + perror("mkdir()"); + exit(1); + } + if (is_on_device(RECOVER_DIR, device_name) == 1) { + fprintf(stderr, "DANGER: recover dir '%s' is on target device '%s', aborted!\n", + RECOVER_DIR, device_name); + exit(1); + } + imax = fs->super->s_inodes_count; + for (icount = 3; icount < imax + 1; icount++) { + flag = 0; + + // printf("%u\n", icount); + retval = ext2fs_read_inode(fs, icount, &inode); + if (retval) { + com_err(program_name, retval, "%s", + "while reading journal inode"); + continue; + } + if (is_inode_extent_clear(&inode)) { + continue; + } + + snprintf(filename, BUFSIZ, "%s/%d_file", RECOVER_DIR, icount); + + recover_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC | O_LARGEFILE, 0640); + if (recover_fd < 0) { + perror("open(inode)"); + continue; + } + + retval = ext2fs_extent_open(fs, icount, &handle); + if (retval) + return retval; + + retval = prase_ino_extent(handle); + if (!retval) { + fprintf(stderr, "Recover error!\n"); + close(recover_fd); + close(device_fd); + exit(1); + } + if (flag) { + fflush(stdout); + } + + close(recover_fd); + } + + close(device_fd); + + fprintf(stderr, "Recover success!\n"); + + exit(0); +} + diff --git a/ops/fs/file/filelife/ops-help b/ops/fs/file/filelife/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..f55bc4fd30f924d9f87caf90cf42a81b43ceac77 --- /dev/null +++ b/ops/fs/file/filelife/ops-help @@ -0,0 +1 @@ +eBPF: show life of file, file name, file age, deleted by who, see helps: t-ops fs file filelife -h; driverd by eBPF diff --git a/ops/fs/file/filelife/ops-run b/ops/fs/file/filelife/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..f8790f615e5a0a761032671a00f564075e49b5ec --- /dev/null +++ b/ops/fs/file/filelife/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL 
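+# Description : wrapper that installs bcc-tools if missing and runs /usr/share/bcc/tools/filelife with the given arguments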
+############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function main() +{ + if [ ! -d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + + printf "${GREEN}if need, see helps:t-ops fs file fileslower -h${NC}\n" + /usr/share/bcc/tools/filelife $@ +} + +main $* diff --git a/ops/fs/file/fileslower/ops-help b/ops/fs/file/fileslower/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..48bdeffaa415cd04d3223d11a02f90173df798cb --- /dev/null +++ b/ops/fs/file/fileslower/ops-help @@ -0,0 +1 @@ +eBPF: show latency and bytes of file sync io, which latency, see helps: t-ops fs file fileslower -h; driverd by eBPF diff --git a/ops/fs/file/fileslower/ops-run b/ops/fs/file/fileslower/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..bee0e9b35109190e06eaa3304fe9da6d9af9c495 --- /dev/null +++ b/ops/fs/file/fileslower/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function main() +{ + if [ ! -d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + + printf "${GREEN}if need, see helps:t-ops fs file fileslower -h${NC}\n" + /usr/share/bcc/tools/fileslower $@ +} + +main $* diff --git a/ops/fs/file/filetop/ops-help b/ops/fs/file/filetop/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b9fec1b0bb74b47bd586d4756a39ec385a8ae241 --- /dev/null +++ b/ops/fs/file/filetop/ops-help @@ -0,0 +1 @@ +eBPF: show files top of byte, see helps: t-ops fs file filetop -h; driverd by eBPF diff --git a/ops/fs/file/filetop/ops-run b/ops/fs/file/filetop/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..af83de5dc61f0954785b8aa529e735379b0e881a --- /dev/null +++ b/ops/fs/file/filetop/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function main() +{ + if [ ! 
-d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + + printf "${GREEN}if need, see helps:t-ops fs file filetop -h${NC}\n" + /usr/share/bcc/tools/filetop $@ +} + +main $* diff --git a/ops/fs/file/ops-help b/ops/fs/file/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..da77d690c7555c8d645bdb5952acccb531345340 --- /dev/null +++ b/ops/fs/file/ops-help @@ -0,0 +1 @@ +fs files tools dir, need enter subdir continue diff --git a/ops/fs/ops-help b/ops/fs/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b93aef9b334fb76ee2c5d65db3c7e0b01484d11f --- /dev/null +++ b/ops/fs/ops-help @@ -0,0 +1 @@ +fs recover tools: oc-ops fs -h diff --git a/ops/fs/readahead/ops-help b/ops/fs/readahead/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..bfd5e989438a29c8f96cc141f3a03736f58467ad --- /dev/null +++ b/ops/fs/readahead/ops-help @@ -0,0 +1 @@ +eBPF: show performance of read-ahead cache, see helps: t-ops fs readahead -h; driverd by eBPF diff --git a/ops/fs/readahead/ops-run b/ops/fs/readahead/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..77cf026248f18b4b31cacff066f61feaa6317d8c --- /dev/null +++ b/ops/fs/readahead/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function main() +{ + if [ ! -d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + + printf "${GREEN}if need, see helps:t-ops fs readahead -h${NC}\n" + /usr/share/bcc/tools/readahead $@ +} + +main $* diff --git a/ops/fs/xfs/ops-help b/ops/fs/xfs/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..29ff7c29f063d35b310c6038edd0ec57bf863619 --- /dev/null +++ b/ops/fs/xfs/ops-help @@ -0,0 +1 @@ +xfs tools diff --git a/ops/fs/xfs/xfs_recover/.gitignore b/ops/fs/xfs/xfs_recover/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fd131b6fde52a5a756461901105f506b6cc2885a --- /dev/null +++ b/ops/fs/xfs/xfs_recover/.gitignore @@ -0,0 +1,81 @@ +# object files +*.o +.dep +.ltdep + +# build system +.census +.gitcensus +/include/platform_defs.h +/include/builddefs +/install-sh + +# magic directory symlinks +/include/disk +/include/xfs + +# packaging +/doc/CHANGES.gz +/xfsprogs-* +/xfsprogs_* +/xfslibs-dev_* + +# autoconf generated files +/aclocal.m4 +/m4/libtool.m4 +/m4/ltoptions.m4 +/m4/ltsugar.m4 +/m4/ltversion.m4 +/m4/lt~obsolete.m4 +/autom4te.cache/ +/config.guess +/config.log +/config.status +/config.sub +/configure + +# libtool +/libtool +/ltmain.sh +*.lo +*.la +.libs + +# gettext +/po/de.mo +/po/pl.mo +/po/xfsprogs.pot + +# cscope stuff +cscope.* + +# quilt stuff +/.pc/ +/patches/ + +# binaries +/copy/xfs_copy +/db/xfs_db +/estimate/xfs_estimate +/fsr/xfs_fsr +/growfs/xfs_growfs +/io/xfs_io +/logprint/xfs_logprint +/mdrestore/xfs_mdrestore +/mkfs/fstyp +/mkfs/mkfs.xfs +/quota/xfs_quota +/repair/xfs_repair +/rtcp/xfs_rtcp +/spaceman/xfs_spaceman +/scrub/xfs_scrub +/scrub/xfs_scrub@.service +/scrub/xfs_scrub_all +/scrub/xfs_scrub_all.cron +/scrub/xfs_scrub_all.service +/scrub/xfs_scrub_fail@.service + +# generated crc files +/libfrog/crc32selftest +/libfrog/crc32table.h +/libfrog/gen_crc32table diff --git a/ops/fs/xfs/xfs_recover/Makefile b/ops/fs/xfs/xfs_recover/Makefile new file mode 100644 index 
0000000000000000000000000000000000000000..c9927991a8e911a7d97fd62199dd1214330b9141 --- /dev/null +++ b/ops/fs/xfs/xfs_recover/Makefile @@ -0,0 +1,12 @@ +XFSPROGS_VERSION = 5.9.0 +XFSPROGS_DIR = xfsprogs-$(XFSPROGS_VERSION) + +default: $(XFSPROGS_DIR) + cp recover/* $(XFSPROGS_DIR)/db + make -C $(XFSPROGS_DIR) + cp $(XFSPROGS_DIR)/db/xfsrecover recover + +$(XFSPROGS_DIR): + curl -fsSL https://kernel.org/pub/linux/utils/fs/xfs/xfsprogs/xfsprogs-$(XFSPROGS_VERSION).tar.xz | tar -xJf - + cd ${XFSPROGS_DIR} && ./configure + diff --git a/ops/fs/xfs/xfs_recover/README.md b/ops/fs/xfs/xfs_recover/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7a8fef71f1cb6df682c8d6938fb2f574ae1a6c46 --- /dev/null +++ b/ops/fs/xfs/xfs_recover/README.md @@ -0,0 +1,33 @@ +# Xfs文件系统误删除文件恢复工具使用 + +这是一个xfs误删除的恢复工具,使用方法: + +编译: +``` +make +``` + +如果make失败的话,需安装以下rpm包再make: +centos: +``` +yum install libtool +yum install libuuid libuuid-devel +yum install libblkid-devel +``` +ubuntu: +``` +apt get glibtoolize +apt install uuid-dev +apt install libblkid-dev +``` + +make成功后,在 recover目录下会编译生成 xfsrecover 可执行文件。 + +之后 cd 到你想恢复数据的目标目录,比如此处目标目录为 /data1 , xfs_recover目录为 /data1/xfs_recover ,执行: + +cd /data1 + +./xfs_recover/db/xfsrecover /dev/sdb1 # /dev/sdb1 为你想恢复数据的文件系统。 + +执行完毕后会在 /data1 下产生 RECOVER 目录,其中的文件就是恢复出来的数据。 + diff --git a/ops/fs/xfs/xfs_recover/ops-help b/ops/fs/xfs/xfs_recover/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..9e92c28eb7e6fb1d55c0a46090273855947f3915 --- /dev/null +++ b/ops/fs/xfs/xfs_recover/ops-help @@ -0,0 +1 @@ +xfs recover tool: oc-ops fs xfs_recover -h diff --git a/ops/fs/xfs/xfs_recover/ops-run b/ops/fs/xfs/xfs_recover/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..19d41a82bd40d972a298cb5f5c7d47bd28fa506b --- /dev/null +++ b/ops/fs/xfs/xfs_recover/ops-run @@ -0,0 +1,68 @@ +#!/bin/bash +############################################### +# File Name :ops-run +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +drop_caches=0 +mem_cost_topn=3 + +usage="\ +Usage: + oc-ops fs xfs xfs_recover [-d dev] [-h] + COMMAND-LINE Options: + -d block dev to recover + -h help: recover xfs fs +" + +function strstr() +{ + echo $1 | grep $2 +} + +function main() + { + dir=$(dirname $0) + while getopts 'd;h' OPT; do + case $OPT in + d) dev="$OPTARG" + ;; + h) echo "$usage" + cat $dir/README.md + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done + sub=$(strstr "rpm -qa" "libtool") + if [ -z "$sub" ]; then + yum install -y libtool + fi + sub=$(strstr "rpm -qa" "libuuid") + if [ -z "$sub" ]; then + yum install -y libuuid + fi + sub=$(strstr "rpm -qa" "libuuid-devel") + if [ -z "$sub" ]; then + yum install -y libuuid-devel + fi + sub=$(strstr "rpm -qa" "libblkid-devel") + if [ -z "$sub" ]; then + yum install -y libblkid-devel + fi + cd $dir + make + $dir/recover/xfsrecover $dev + cd - + } + + main $* diff --git a/ops/fs/xfs/xfs_recover/recover/Makefile b/ops/fs/xfs/xfs_recover/recover/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..af7dbe1ad7f41ac00b18e0e79a5e84045667bdb9 --- /dev/null +++ b/ops/fs/xfs/xfs_recover/recover/Makefile @@ -0,0 +1,27 @@ +# + +TOPDIR = .. 
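+# Built inside the xfsprogs source tree (the top-level Makefile copies this directory into xfsprogs' db/), reusing the xfsprogs build system like the other db tools.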
+include $(TOPDIR)/include/builddefs + +LTCOMMAND = xfsrecover + +HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \ + btblock.h bmroot.h check.h command.h crc.h debug.h \ + dir2.h dir2sf.h dquot.h echo.h faddr.h field.h \ + flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \ + io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \ + sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \ + fuzz.h +CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c + +LLDLIBS = $(LIBXFS) $(LIBXLOG) $(LIBFROG) $(LIBUUID) $(LIBRT) $(LIBPTHREAD) +LTDEPENDENCIES = $(LIBXFS) $(LIBXLOG) $(LIBFROG) +LLDFLAGS += -static-libtool-libs + +default: depend $(LTCOMMAND) + +include $(BUILDRULES) + +install-dev: + +-include .dep diff --git a/ops/fs/xfs/xfs_recover/recover/init.c b/ops/fs/xfs/xfs_recover/recover/init.c new file mode 100644 index 0000000000000000000000000000000000000000..356d3129c92403e6c8e3748a5902ae9b6bc1a446 --- /dev/null +++ b/ops/fs/xfs/xfs_recover/recover/init.c @@ -0,0 +1,686 @@ +/* + * Copyright (c) Tencent, Inc. + * All Rights Reserved. + * Create add Edit by zorrozou 2020/03/30. + * update by curuwang 2022/01/18 + * add recover dir by jindazhong 2022/04/12 + */ + +#define _LARGEFILE_SOURCE +#define _LARGEFILE64_SOURCE +#define _FILE_OFFSET_BITS 64 + +#include "libxfs.h" +#include "libxlog.h" +#include "init.h" +#include "bmap.h" +#define RECOVER_DIR "./RECOVER" +#define FILE_DIR_HASH_SIZE 10000 +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) +// path /a/b/c/d max size +#define FILE_PATH_MAX 1024 + +char *fsdevice; +static int recover_fd, device_fd; +int blkbb; +int exitcode; +int expert_mode; +static int force; +static struct xfs_mount xmount; +struct xfs_mount *mp; +static struct xlog xlog; +xfs_agnumber_t cur_agno = NULLAGNUMBER; +xfs_dinode_t *dinode; +struct xfs_sb *sbp; + +struct inode_file_list file_list; +struct hlist_head file_dir_htable[FILE_DIR_HASH_SIZE]; + + +typedef struct inode_file_list{ + __u64 inumber; + struct list_head list; +} inode_file_list; + +typedef struct file_dir_hash_node { + __u64 inumber; + uint8_t file_type; + uint8_t file_name_len; + char* file_name; + __u64 parent_inumber; + struct hlist_node list; +} file_dir_hash_node; + +static int recover_block_to_file(int devfd, int inofd, __u64 block, __u64 len, __u64 start) +{ + off_t offset_dev, offset_ino, offset; + ssize_t size, ret; + char buf[sbp->sb_blocksize]; + int i; + + offset_dev = lseek(devfd, start * sbp->sb_blocksize, SEEK_SET); + if (offset_dev < 0) { + perror("lseek(devfd)"); + return 0; + } + + offset_ino = lseek(inofd, block * sbp->sb_blocksize, SEEK_SET); + if (offset_ino < 0) { + perror("lseek(inofd)"); + return 0; + } + + for (i = 0; i < len; i++) { + offset = 0; + size = 0; + while (offset < sbp->sb_blocksize) { + size = read(devfd, buf + offset, sbp->sb_blocksize - offset); + if (size < 0) { + lseek(devfd, offset_dev + sbp->sb_blocksize, SEEK_SET); + perror("read(dev_fd)"); + continue; + } + offset += size; + } + //printf("before write offset: %llu\n", offset); + //printf("before write size: %llu\n", size); + while (offset != 0) { + ret = write(inofd, buf + (sbp->sb_blocksize - offset), offset); + if (ret < 0) { + lseek(inofd, offset_ino + sbp->sb_blocksize, SEEK_SET); + perror("write(inofd)"); + continue; + } + offset -= ret; + //printf("ret: %llu\n", ret); + } + //printf("after write offset: %llu\n", offset); + //printf("after write size: %llu\n", size); + } + + offset_dev = lseek(devfd, 0, SEEK_SET); + if 
(offset_dev < 0) { + perror("lseek(devfd)"); + return 0; + } + + offset_ino = lseek(inofd, 0, SEEK_SET); + if (offset_ino < 0) { + perror("lseek(inofd)"); + return 0; + } + return 1; +} + +static char *xfs_get_block(xfs_filblks_t blknum, uint32_t bsize) +{ + char *buf; + ssize_t size; + int fd; + off_t offset; + + fd = libxfs_device_to_fd(mp->m_ddev_targp->dev); + + buf = malloc(bsize); + if (buf == NULL) { + fprintf(stderr, "xfs_get_block: malloc() error!\n"); + return NULL; + } + + offset = lseek(fd, blknum * bsize, SEEK_SET); + if (offset < 0) { + perror("xfs_get_block: lseek()"); + return NULL; + } + + size = read(fd, buf, bsize); + if (size < 0) { + perror("xfs_get_block: read()"); + return NULL; + } + + return buf; +} + + +static int is_inode_used(xfs_dinode_t *dinode) +{ + //if (dinode->di_atime.t_sec == 0 || + // dinode->di_atime.t_nsec == 0) { + // dinode->di_nlink > 0) { + if (dinode->di_size != 0){ + return 1; + } + return 0; +} + +static int btree_block_travel(struct xfs_btree_block *block) +{ + int count, state, num, retval; + char *buf; + xfs_bmbt_rec_t *rec; + xfs_fileoff_t startoff; + xfs_fsblock_t startblock; + xfs_filblks_t blknum; + xfs_filblks_t blockcount; + xfs_bmbt_ptr_t *pp; + + if (be32_to_cpu(block->bb_magic) != XFS_BMAP_CRC_MAGIC) { + printf("block->bb_magic: %x, block not a bmap!\n", be32_to_cpu(block->bb_magic)); + return 0; + } + + if (be16_to_cpu(block->bb_level) == 0) { + /* leaf */ + num = be16_to_cpu(block->bb_numrecs); + printf("block->bb_numrecs: %d\n", num); + /* + buf = block; + for (i=1; i <= 4096;i++) { + printf("%.2x ", buf[i-1]); + if (i % 16 == 0) { + printf("\n"); + } + } + */ + rec = (xfs_bmbt_rec_t * )((struct xfs_btree_block *) block + 1); + for (count = 0; count < num; count++) { + convert_extent(rec + count, &startoff, &startblock, &blockcount, &state); + + /* + printf("rec l0: %16llx\t", (rec+count)->l0); + printf("rec l1: %16llx\n", (rec+count)->l1); + printf("addr rec: %p\n", rec+count); + */ + + printf("%lu %lu %lu\n", startoff, startblock, blockcount); + retval = recover_block_to_file(device_fd, recover_fd, startoff, blockcount, startblock); + if (!retval) { + fprintf(stderr, "recover_block_to_file(btree_block_travel)\n"); + return 0; + } + } + } else if (be16_to_cpu(block->bb_level) > 0 && + be16_to_cpu(block->bb_level) <= 5) { + /* middle */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[0]); + for (count = 0; count < be16_to_cpu(block->bb_numrecs); count++) { + blknum = cpu_to_be64(*pp + count); + printf("blknum: %lu\n", blknum); + buf = xfs_get_block(blknum, sbp->sb_blocksize); + if (buf == NULL) { + return 0; + } + btree_block_travel((struct xfs_btree_block *) buf); + free(buf); + } + } else { + /* do nothing! 
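(an extent btree deeper than 5 levels is not expected here, so such blocks are skipped)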
*/ + } + + return 1; +} + + +static int inode_extent_tree_travel(xfs_bmdr_block_t *rblock, int iflag) +{ + int state, fsize, i, retval; + char *buf; + xfs_fileoff_t startoff; + xfs_fsblock_t startblock; + xfs_filblks_t blockcount, blknum; + xfs_bmbt_rec_t *rec; + xfs_bmbt_ptr_t *pp; + xfs_buf_t *bp; + xfs_agnumber_t agno; + + + if (rblock == NULL) { + return 0; + } + + if (cpu_to_be16(rblock->bb_level) == 0) { + /* extent fmt or leaf */ + if (iflag == 1) { + /* extent */ + rec = (xfs_bmbt_rec_t *) rblock; + } else { + /* leaf */ + rec = (xfs_bmbt_rec_t * )(rblock + 1); + } + convert_extent(rec, &startoff, &startblock, &blockcount, &state); + while (1) { + convert_extent(rec, &startoff, &startblock, &blockcount, &state); + if (blockcount == 0) { + break; + } + + agno = XFS_FSB_TO_AGNO(mp, startblock); + startblock = XFS_FSB_TO_DADDR(mp, startblock) >> mp->m_blkbb_log; + printf("startoff:%-10lu\tstartblock:%-10lu\tblockcount:%-10lu\tag:%u\n", + startoff, startblock, blockcount, agno); + retval = recover_block_to_file(device_fd, recover_fd, startoff, blockcount, startblock); + if (!retval) { + fprintf(stderr, "recover_block_to_file(inode_extent_tree_travel)\n"); + return 0; + } + rec++; + } + } else { + /* btree fmt: root */ + if (cpu_to_be16(rblock->bb_level) > 0 && cpu_to_be16(rblock->bb_numrecs) < 10) { + /* root */ + bp = malloc(sizeof(xfs_buf_t)); + fsize = XFS_DFORK_SIZE(dinode, mp, XFS_DATA_FORK); + if (fsize > sbp->sb_inodesize) { + return 0; + } + printf("root inode fsize: %u\n", fsize); + printf("bb_numrecs: %u\n", cpu_to_be16(rblock->bb_numrecs)); + pp = XFS_BMDR_PTR_ADDR(rblock, 1, libxfs_bmdr_maxrecs(fsize, 0)); + for (i = 0; i < cpu_to_be16(rblock->bb_numrecs); i++) { + blknum = cpu_to_be64(*pp + i); + printf("blknum: %lu\n", blknum); + buf = xfs_get_block(blknum, sbp->sb_blocksize); + if (buf == NULL) { + fprintf(stderr, "inode_extent_tree_travel: btree_block_travel() error!\n"); + return 0; + } + btree_block_travel((struct xfs_btree_block *) buf); + /* + libxfs_readbufr(mp->m_ddev_targp, blknum, bp, sbp->sb_blocksize, 0); + inode_extent_tree_travel(bp->b_addr, 0); + */ + free(buf); + } + free(bp); + } else { + /* do nothing! 
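(a root header claiming 10 or more records is treated as implausible for an inode data fork and skipped)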
*/ + } + } + + return 1; +} + +static inline int hlist_empty(const struct hlist_head *h){ + return !h->first; +} + +static struct file_dir_hash_node* search_file_dir_htable(__u64 inumber){ + struct file_dir_hash_node *data_node = NULL; + struct hlist_node *hlist; + __u64 key; + + key = inumber % FILE_DIR_HASH_SIZE; + if(hlist_empty(&file_dir_htable[key])) + return NULL; + else{ + hlist_for_each_entry(data_node, hlist, &file_dir_htable[key], list){ + if(data_node->inumber == inumber){ + return data_node; + } + + } + } + return NULL; +} + +static int insert_file_dir_htable(__u64 inumber, char *name, uint8_t namelen, uint8_t dir_ftype, __u64 parent_inumber){ + __u64 key; + struct file_dir_hash_node *data_node; + data_node = malloc(sizeof(struct file_dir_hash_node)); + if(!data_node){ + perror("malloc error struct file_dir_hash_node "); + return -1; + } + + data_node->inumber = inumber; + data_node->file_name = malloc(namelen+1); + memcpy(data_node->file_name, name, namelen); + data_node->file_name[namelen] = '\0'; + data_node->file_name_len = namelen; + data_node->file_type = dir_ftype; + data_node->parent_inumber = parent_inumber; + INIT_HLIST_NODE(&data_node->list); + + key = data_node->inumber % FILE_DIR_HASH_SIZE; + hlist_add_head(&data_node->list, &file_dir_htable[key]); + return 0; +} + +static int dir_travel(char * buf, uint16_t isize, uint16_t ipblock){ + struct xfs_dir2_data_hdr *block; + xfs_dir2_block_tail_t *btp = NULL; + xfs_dir2_leaf_entry_t *lep = NULL; + int i,j; + struct xfs_dir2_data_hdr *data; + xfs_dir2_data_entry_t *dep; + char *ptr; + char *endptr; + xfs_ino_t lino; + struct xfs_dir2_sf_hdr *sf; + xfs_dir2_sf_entry_t *sfe; + uint8_t dir_ftype; + __u64 parent_inumber = 0; + + data = (struct xfs_dir2_data_hdr*) buf; + block = (struct xfs_dir2_data_hdr*) buf; + //recover block dir + if (be32_to_cpu(block->magic) == XFS_DIR2_BLOCK_MAGIC || + be32_to_cpu(data->magic) == XFS_DIR2_DATA_MAGIC || + be32_to_cpu(block->magic) == XFS_DIR3_BLOCK_MAGIC || + be32_to_cpu(data->magic) == XFS_DIR3_DATA_MAGIC) { + + ptr = (char *)data + mp->m_dir_geo->data_entry_offset; + if (be32_to_cpu(block->magic) == XFS_DIR2_BLOCK_MAGIC || + be32_to_cpu(block->magic) == XFS_DIR3_BLOCK_MAGIC) { + btp = xfs_dir2_block_tail_p(mp->m_dir_geo, block); + lep = xfs_dir2_block_leaf_p(btp); + endptr = (char *)lep; + } else{ + endptr = (char *)data + mp->m_dir_geo->blksize; + } + while (ptr < endptr) { + dep = (xfs_dir2_data_entry_t *)ptr; + dir_ftype = xfs_dir2_data_get_ftype(mp, dep); + if ( dep->namelen != 0 && dir_ftype != 0 && be64_to_cpu(dep->inumber) != 0 ) { + if (parent_inumber == 0 && dep->name[0] == '.' && dep->namelen == 1){ + parent_inumber = be64_to_cpu(dep->inumber); + ptr += libxfs_dir2_data_entsize(mp, dep->namelen); + continue; + } + if (dep->name[0] == '.' && dep->name[1] == '.' 
&& dep->namelen == 2){ + ptr += libxfs_dir2_data_entsize(mp, dep->namelen); + continue; + } + //printf("name:%s, namelen:%d,inode:%u,filetype:%u, parent_inumber:%u\n", dep->name, dep->namelen, be64_to_cpu(dep->inumber), dir_ftype, parent_inumber); + insert_file_dir_htable(be64_to_cpu(dep->inumber), (char*)dep->name, dep->namelen, dir_ftype, parent_inumber); + } + ptr += libxfs_dir2_data_entsize(mp, dep->namelen); + } + } + + //recover inode dir + dinode = (xfs_dinode_t *) buf; + for (i = 0; i < ipblock; i++) { + dinode = (xfs_dinode_t * ) & buf[isize * i]; + if (cpu_to_be16(dinode->di_magic) != XFS_DINODE_MAGIC) { + continue; + } + //if (dinode->di_format == XFS_DINODE_FMT_LOCAL){ + if ( 1==1 ){ + //if (be64_to_cpu(dinode->di_ino) == 67){ + sf = (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dinode); + sfe = xfs_dir2_sf_firstentry(sf); + //sf->count rewrite 0? + for (j = sf->count - 1; j >= -256; j--) { + lino = libxfs_dir2_sf_get_ino(mp, sf, sfe); + dir_ftype = libxfs_dir2_sf_get_ftype(mp, sfe); + if ( dir_ftype == 0 || lino == 0 ){ + break; + } + //printf("file:%s, inode:%u,dir_ftype:%u,parent_inode:%u\n",sfe->name, lino, dir_ftype, be64_to_cpu(dinode->di_ino)); + insert_file_dir_htable(lino, (char*)sfe->name, sfe->namelen, dir_ftype, be64_to_cpu(dinode->di_ino)); + sfe = libxfs_dir2_sf_nextentry(mp, sf, sfe); + } + } + } + return 0; +} + +static int _mkdir(const char *dir) { + char tmp[256]; + char *p = NULL; + size_t len; + snprintf(tmp, sizeof(tmp),"%s",dir); + len = strlen(tmp); + if(tmp[len - 1] == '/'){ + tmp[len - 1] = 0; + } + for(p = tmp + 1; *p; p++){ + if(*p == '/') { + *p = 0; + mkdir(tmp, 0777); + *p = '/'; + } + } + return mkdir(tmp, 0777); +} + +static int mv_or_cp_file(__u64 inumber, char *file_path, char *file_name){ + int retval; + char recover_file_path[FILE_PATH_MAX+10]; + char recover_file[FILE_PATH_MAX+256]; + char inode_file[128]; + snprintf(inode_file, 128, "%s/%llu_file", RECOVER_DIR, inumber); + snprintf(recover_file_path, FILE_PATH_MAX+10, "%s/%s", RECOVER_DIR, file_path); + snprintf(recover_file, FILE_PATH_MAX+256, "%s/%s", recover_file_path, file_name); + if ( strlen(file_name) == 0 ){ + return -1; + } + if (access(recover_file_path, 0) != 0){ + retval = _mkdir(recover_file_path); + if (retval < 0 && errno != EEXIST) { + perror("mkdir() recover_file_path"); + } + } + //mv + if (access(inode_file, 0) != 0){ + perror("inode_file is not exist"); + return -1; + } + + //printf("inode_file:%s, recover_file:%s\n",inode_file, recover_file); + if (rename(inode_file, recover_file) != 0){ + perror("rename error"); + return -1; + } + + //cp todo + return 0; +} + +static void recover_dir(){ + struct inode_file_list *file_list_node; + struct file_dir_hash_node *data_node,*tmp_node; + char *file_path, *file_path_tmp; + file_path = malloc(FILE_PATH_MAX); + file_path_tmp = malloc(FILE_PATH_MAX); + list_for_each_entry(file_list_node, &file_list.list, list){ + file_path = memset(file_path, '\0', FILE_PATH_MAX); + file_path_tmp = memset(file_path_tmp, '\0', FILE_PATH_MAX); + data_node = search_file_dir_htable(file_list_node->inumber); + if (!data_node){ + continue; + } + if (data_node->file_type == XFS_DIR3_FT_DIR){ + continue; + } + //printf("inode :%u, file:%s\n", file_list_node->inumber,data_node->file_name); + tmp_node = data_node; + while ( tmp_node->parent_inumber != sbp->sb_rootino && tmp_node->parent_inumber != 0 ){ + tmp_node = search_file_dir_htable(tmp_node->parent_inumber); + if (!tmp_node){ + break; + } + if (tmp_node->file_type !=XFS_DIR3_FT_DIR){ + break; + } + 
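+ /* prepend this ancestor's directory name, rebuilding the file's relative path from leaf towards the root */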
snprintf(file_path_tmp, FILE_PATH_MAX, "%s/%s",tmp_node->file_name, file_path); + memcpy(file_path, file_path_tmp, strlen(file_path_tmp)); + } + printf("inode:%llu, path:%s, file_name:%s\n",data_node->inumber, file_path,data_node->file_name); + mv_or_cp_file(data_node->inumber, file_path, data_node->file_name); + } +} + +static int disk_traverse(char *device, uint32_t bsize, uint16_t isize, uint16_t ipblock) +{ + int i, fd; + char buf[bsize]; + ssize_t size, offset; + xfs_bmdr_block_t *rblock; + struct inode_file_list *file_list_node; + char filename[BUFSIZ]; + + fd = open(device, O_RDONLY | O_LARGEFILE); + if (fd < 0) { + perror("open(device)"); + return 0; + } + + offset = -1; + while ((size = read(fd, buf, bsize)) == bsize) { + dir_travel(buf, isize, ipblock); + offset++; + dinode = (xfs_dinode_t *) buf; + /*if (cpu_to_be16(dinode->di_magic) != XFS_DINODE_MAGIC) { + continue; + }*/ + for (i = 0; i < ipblock; i++) { + dinode = (xfs_dinode_t * ) & buf[isize * i]; + if (cpu_to_be16(dinode->di_magic) != XFS_DINODE_MAGIC) { + continue; + } + if (is_inode_used(dinode)) { + continue; + } + //printf("magic: %d\n", dinode->di_magic); + // printf("inode: %llu\n", be64_to_cpu(dinode->di_ino)); + // printf("inode offset: %lu:%d\n", offset, i); + + //snprintf(filename, BUFSIZ, "%s/%lu_%d_file", RECOVER_DIR, offset, i); + snprintf(filename, BUFSIZ, "%s/%llu_file", RECOVER_DIR, be64_to_cpu(dinode->di_ino)); + file_list_node = (struct inode_file_list *)malloc(sizeof(struct inode_file_list)); + file_list_node->inumber = be64_to_cpu(dinode->di_ino); + list_add(&(file_list_node->list), &(file_list.list)); + + recover_fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC | O_LARGEFILE, 0644); + if (recover_fd < 0) { + perror("open(inode)"); + continue; + } + rblock = (xfs_bmdr_block_t *) XFS_DFORK_PTR(dinode, XFS_DATA_FORK); + //printf("rblock addr - inode addr: %lu\n", (uint64_t)rblock - (uint64_t)dinode); + inode_extent_tree_travel(rblock, 1); + close(recover_fd); + } + } + + if (size != bsize && size != 0) { + fprintf(stderr, "read(): read device error!\n"); + close(fd); + return 0; + } + + close(fd); + return 1; +} + +int main(int argc, char **argv) +{ + struct xfs_buf *bp; + unsigned int agcount; + int retval,i; + + setlinebuf(stdout); + + progname = basename(argv[0]); + + if (argc != 2) { + fprintf(stderr, "argument error!\n"); + exit(1); + } + + fsdevice = argv[1]; + if (!x.disfile) + x.volname = fsdevice; + else + x.dname = fsdevice; + + x.bcache_flags = CACHE_MISCOMPARE_PURGE; + if (!libxfs_init(&x)) { + fputs(_("\nfatal error -- couldn't initialize XFS library\n"), + stderr); + exit(1); + } + /* + * Read the superblock, but don't validate it - we are a diagnostic + * tool and so need to be able to mount busted filesystems. 
+ */ + memset(&xmount, 0, sizeof(struct xfs_mount)); + libxfs_buftarg_init(&xmount, x.ddev, x.logdev, x.rtdev); + retval = -libxfs_buf_read_uncached(xmount.m_ddev_targp, XFS_SB_DADDR, + 1 << (XFS_MAX_SECTORSIZE_LOG - BBSHIFT), 0, &bp, NULL); + if (retval) { + fprintf(stderr, _("%s: %s is invalid (cannot read first 512 " + "bytes)\n"), progname, fsdevice); + exit(1); + } + + /* copy SB from buffer to in-core, converting architecture as we go */ + libxfs_sb_from_disk(&xmount.m_sb, bp->b_addr); + + sbp = &xmount.m_sb; + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + fprintf(stderr, _("%s: %s is not a valid XFS filesystem (unexpected SB magic number 0x%08x)\n"), + progname, fsdevice, sbp->sb_magicnum); + if (!force) { + fprintf(stderr, _("Use -F to force a read attempt.\n")); + exit(EXIT_FAILURE); + } + } + + agcount = sbp->sb_agcount; + mp = libxfs_mount(&xmount, sbp, x.ddev, x.logdev, x.rtdev, + LIBXFS_MOUNT_DEBUGGER); + if (!mp) { + fprintf(stderr, + _("%s: device %s unusable (not an XFS filesystem?)\n"), + progname, fsdevice); + exit(1); + } + mp->m_log = &xlog; + + + /* + * xfs_check needs corrected incore superblock values + */ + if (sbp->sb_rootino != NULLFSINO && + xfs_sb_version_haslazysbcount(&mp->m_sb)) { + int error = -libxfs_initialize_perag_data(mp, sbp->sb_agcount); + if (error) { + fprintf(stderr, + _("%s: cannot init perag data (%d). Continuing anyway.\n"), + progname, error); + } + } + + printf("agcount: %u\n", agcount); + printf("isize: %u\n", sbp->sb_inodesize); + printf("bsize: %u\n", sbp->sb_blocksize); + printf("rootinode: %lu\n", sbp->sb_rootino); + printf("inopblock: %u\n", sbp->sb_inopblock); + printf("sb_icount: %lu\n", sbp->sb_icount); + device_fd = open(fsdevice, O_RDONLY); + if (device_fd < 0) { + perror("open(device)"); + exit(1); + } + + retval = mkdir(RECOVER_DIR, 0777); + if (retval < 0 && errno != EEXIST) { + perror("mkdir()"); + exit(1); + } + + INIT_LIST_HEAD(&file_list.list); + for(i = 0; i < FILE_DIR_HASH_SIZE; i++){ + INIT_HLIST_HEAD(&file_dir_htable[i]); + } + disk_traverse(fsdevice, sbp->sb_blocksize, sbp->sb_inodesize, sbp->sb_inopblock); + + recover_dir(); + close(device_fd); + exit(0); +} diff --git a/ops/fs/xfs/xfsdist/ops-help b/ops/fs/xfs/xfsdist/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..9ccf441dc5a6f82ac4895c5a926c84989439d2d3 --- /dev/null +++ b/ops/fs/xfs/xfsdist/ops-help @@ -0,0 +1 @@ +eBPF: summarize XFS operation latency, see helps: t-ops fs xfs xfsdist -h; driverd by eBPF diff --git a/ops/fs/xfs/xfsdist/ops-run b/ops/fs/xfs/xfsdist/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..d34e5836b9794028021613fd6bacd44727ad0a25 --- /dev/null +++ b/ops/fs/xfs/xfsdist/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function main() +{ + if [ ! 
-d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + + printf "${GREEN}if need, see helps:t-ops fs xfs xfsdist -h${NC}\n" + /usr/share/bcc/tools/xfsdist $@ +} + +main $* diff --git a/ops/healthcheck/build_healthcheck_bin/README.md b/ops/healthcheck/build_healthcheck_bin/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a7314e3200f951db0c6f1f9d5524aa548dc809b2 --- /dev/null +++ b/ops/healthcheck/build_healthcheck_bin/README.md @@ -0,0 +1,30 @@ +# build_healthcheck_bin 目录说明 + +本目录用于 t-ops-healthcheck 项目的自动化打包二进制文件流程,包含以下内容: + +- `build.sh` + 自动化打包脚本。请在本目录下运行该脚本(即:`./build.sh`),它会自动生成 hidden_imports 列表,并调用 PyInstaller 对项目主程序进行打包。 + +- `gen_hidden_imports.py` + 辅助脚本,用于自动生成 PyInstaller 打包时所需的 hidden_imports 参数列表,缺省只包含项目根目录下utilities目录所有.py结尾的文件。 + +- `hidden_imports.txt` + 由 `gen_hidden_imports.py` 生成,记录 PyInstaller 打包时需要额外指定的隐藏依赖模块(静态代码分析工具无法发现需动态加载的依赖模块)。 + +- `build/` + PyInstaller 打包过程中生成的临时工作目录。 + +- `dist/` + PyInstaller 打包完成后生成的可执行二进制文件存放目录。 + +- `tos-health-check.spec`(如存在) + PyInstaller 打包配置文件。 + +## 使用说明 + +1. 需确保已安装 Python3 及 PyInstaller。 +2. 进入本目录后,运行 `./build.sh` 脚本进行自动化打包。 +3. 打包完成后,生成的二进制文件会在本目录下的 `dist/` 文件夹中。 + +> **注意:** +> `build.sh` 只能在本目录下运行,否则路径依赖会导致脚本无法正常工作。 diff --git a/ops/healthcheck/build_healthcheck_bin/build.sh b/ops/healthcheck/build_healthcheck_bin/build.sh new file mode 100755 index 0000000000000000000000000000000000000000..fd2a4504bab3a01863f08fa61fef83c250c920d2 --- /dev/null +++ b/ops/healthcheck/build_healthcheck_bin/build.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# 判断是否在 build_healthcheck_bin 目录下执行 +if [ "$(basename "$(pwd)")" != "build_healthcheck_bin" ]; then + echo "请 cd 到 build_healthcheck_bin 目录下执行此脚本!" + exit 1 +fi + +set -e +set -u + +echo "Step 1: Switching to project root directory..." +cd "$(cd "$(dirname "$0")" && pwd)/.." +echo "Current directory: $(pwd)" + +echo "Step 2: Cleaning old build, dist directories and all __pycache__ folders..." +#rm -rf build_healthcheck_bin/build build_healthcheck_bin/dist +find . -type d -name "__pycache__" -exec rm -rf {} + +echo "Clean up completed." + +# 删除旧的 spec 文件,防止影响输出文件名 +rm -f build_healthcheck_bin/t-ops-healthcheck.spec + +echo "Step 3: Generating hidden_imports.txt..." +python3 build_healthcheck_bin/gen_hidden_imports.py > build_healthcheck_bin/hidden_imports.txt +echo "hidden_imports.txt generated. Content:" +cat build_healthcheck_bin/hidden_imports.txt +echo "----------------------------------------" + +echo "Step 4: Packaging main.py with PyInstaller..." +if ! pyinstaller -v ; then + cat << 'EOF' +#################################### +PyInstaller not found. Solution: +1. Execute 'setup_dev_env.sh' located at ../ +2. Activate virtual environment: 'source .venv/bin/activate' +3. Re-execute 'build.sh' +EOF + + exit 1 +fi + +if ! pyinstaller --onefile main.py \ + --distpath build_healthcheck_bin/dist \ + --workpath build_healthcheck_bin/build \ + --specpath build_healthcheck_bin \ + --paths="$(pwd)" \ + --name tos-health-check \ + $(cat build_healthcheck_bin/hidden_imports.txt); then + echo "PyInstaller packaging failed." + + cat << 'EOF' +可先尝试重新编译 Python3.11 再运行 build.sh,方法如下: +1.) # 安装libffi-dev + yum install libffi-devel +2.) # 下载源码包, 根据实际情况替换 ,已验证 Python3.11.12 可用,但是鼓励尝试使用最新版本 python3.11 + wget https://www.python.org/ftp/python/3.11./Python-3.11..tgz +3.) # 创建编译产物的目录(不要修改目录的路径与名称) + mkdir python3.11-build +4.) # 解压文件至项目根目录 + tar xzf Python-3.11..tgz +5.) cd Python-3.11. +6.) 
CFLAGS="-Wno-error" ./configure --prefix=/usr/local --enable-optimizations --with-system-ffi --enable-shared +7.) make -j$(nproc) +8.) make install DESTDIR=../python3.11-build +9.) export LD_LIBRARY_PATH=$(pwd)/../python3.11-build/usr/local/lib:$LD_LIBRARY_PATH +10.) # 将编译的 python3.11 添加至 PATH 环境变量(可选操作步骤) + export PATH=$(pwd)/../python3.11-build/usr/local/bin:$PATH +11.) # 验证Python版本 + ../python3.11-build/usr/local/bin/python3.11 -c "import platform; print(platform.python_version())" +EOF + exit 1 +fi + +echo "PyInstaller packaging completed." + +echo "Step 5: Build result" +if [ -f build_healthcheck_bin/dist/tos-health-check ]; then + echo "Build succeeded! Binary is at: build_healthcheck_bin/dist/tos-health-check" +else + echo "Build failed. Please check the log above." +fi + diff --git a/ops/healthcheck/build_healthcheck_bin/gen_hidden_imports.py b/ops/healthcheck/build_healthcheck_bin/gen_hidden_imports.py new file mode 100644 index 0000000000000000000000000000000000000000..8a6fde899d90e2f3f5bae1622350d3cdf0434c34 --- /dev/null +++ b/ops/healthcheck/build_healthcheck_bin/gen_hidden_imports.py @@ -0,0 +1,14 @@ + +import os + +def gen_hidden_imports(root): + for dirpath, dirnames, filenames in os.walk(root): + for f in filenames: + if f.endswith('.py') and f != '__init__.py': + rel_path = os.path.relpath(os.path.join(dirpath, f), os.getcwd()) + mod = rel_path.replace('/', '.').replace('\\', '.').replace('.py', '') + print(f'--hidden-import={mod} ', end='') + +# 必须在项目根目录下的第一级子目录下执行 +gen_hidden_imports('utilities') +#gen_hidden_imports('checks') diff --git a/ops/healthcheck/build_healthcheck_bin/hidden_imports.txt b/ops/healthcheck/build_healthcheck_bin/hidden_imports.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe9404122c6e31fcf2928e18e1f4e92ddbd53471 --- /dev/null +++ b/ops/healthcheck/build_healthcheck_bin/hidden_imports.txt @@ -0,0 +1 @@ +--hidden-import=utilities.file_content_utils --hidden-import=utilities.runcmd \ No newline at end of file diff --git a/ops/healthcheck/build_healthcheck_bin/tos-health-check.spec b/ops/healthcheck/build_healthcheck_bin/tos-health-check.spec new file mode 100644 index 0000000000000000000000000000000000000000..8e6362bb4e3d3379034722d66ede1889e1cf4be1 --- /dev/null +++ b/ops/healthcheck/build_healthcheck_bin/tos-health-check.spec @@ -0,0 +1,38 @@ +# -*- mode: python ; coding: utf-8 -*- + + +a = Analysis( + ['../main.py'], + pathex=['/data/tencentos-tools/ops/healthcheck'], + binaries=[], + datas=[], + hiddenimports=['utilities.file_content_utils', 'utilities.runcmd'], + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=[], + noarchive=False, + optimize=0, +) +pyz = PYZ(a.pure) + +exe = EXE( + pyz, + a.scripts, + a.binaries, + a.datas, + [], + name='tos-health-check', + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + upx_exclude=[], + runtime_tmpdir=None, + console=True, + disable_windowed_traceback=False, + argv_emulation=False, + target_arch=None, + codesign_identity=None, + entitlements_file=None, +) diff --git a/ops/healthcheck/checks/mem/saturation/zone_memory_pressure_check.py b/ops/healthcheck/checks/mem/saturation/zone_memory_pressure_check.py new file mode 100644 index 0000000000000000000000000000000000000000..ffe7170d4fb23e9a73c6fc63dba8b5bc77b9ef48 --- /dev/null +++ b/ops/healthcheck/checks/mem/saturation/zone_memory_pressure_check.py @@ -0,0 +1,103 @@ +# checks/mem/saturation/zoneinfo_mem_pressure_check.py + +from core.base_plugin import BasePlugin +from core.datacache 
import DataCache +from utilities import file_content_utils +import io + +class ZoneinfoMemoryPressureCheck(BasePlugin): + """Check memory pressure for each node/zone in /proc/zoneinfo.""" + + name = "Zoneinfo Memory Pressure Check" + version = "1.1.0" + description = ( + f"通过比较 /proc/zoneinfo 中每个 Node/Zone 的free与high/low/min水位线,检查内存压力。\n" + f"Checks each node/zone in /proc/zoneinfo for memory pressure " + f"by comparing free pages with high/low/min watermarks." + + ) + solution = ( + "建议结合其他检查插件、 top 以及 /proc/zoneinfo 等信息排查内存消耗较大的类型和进程。\n" + "Consider investigating memory consumers from /proc/zoneinfo or tuning memory." + + ) + + subsystem = "mem" + category = "saturation" + + async def push_result(self) -> dict | None: + """Check /proc/zoneinfo for memory pressure and return result dict.""" + zoneinfo_cache: DataCache = ( + self.data_caches_dict.get("mem_proc_zoneinfo") + if self.data_caches_dict else None + ) + if not zoneinfo_cache: + return self.build_check_result( + details="No mem_proc_zoneinfo data available.", + level="ERROR" + ) + + content = zoneinfo_cache.get_content() + zoneinfo_data = file_content_utils.parse_proc_zoneinfo(content) + + zone_statuses = [] + overall_level = "OK" + missing_zones = [] + + # 逐个 zone 检查并记录最严重的 level + for node_id, zones in zoneinfo_data.items(): + for zone_name, stats in zones.items(): + free = stats.get("free") + high = stats.get("high") + low = stats.get("low") + min = stats.get("min") + if None in (free, high, low, min): + missing_zones.append((node_id, zone_name)) + continue + + level = "OK" + # 规则判定 + if free < min: + level = "CRITICAL" + elif free < low: + level = "WARNING" + elif high != 0: + if free / high <= 1.2: + level = "INFO" + + zone_statuses.append({ + "level": level, + "node_id": node_id, + "zone_name": zone_name, + "free": free, + "min": min, + "low": low, + "high": high + }) + + # 记录最严重的level + if level == "CRITICAL": + overall_level = "CRITICAL" + elif level == "WARNING" and overall_level != "CRITICAL": + overall_level = "WARNING" + elif level == "INFO" and overall_level not in ("CRITICAL", "WARNING"): + overall_level = "INFO" + + # 生成 details 字符串 + details_buffer = io.StringIO() + details_buffer.write("Zone pressure summary:\n") + for z in zone_statuses: + details_buffer.write( + f"Node {z['node_id']} Zone {z['zone_name']}: " + f"free={z['free']}, min={z['min']}, low={z['low']}, high={z['high']} => {z['level']}\n" + ) + if missing_zones: + details_buffer.write(f"\n[WARNING] The following zones have incomplete data and were skipped:\n") + for node_id, zone_name in missing_zones: + details_buffer.write(f" Node {node_id} Zone {zone_name}") + details = details_buffer.getvalue() + + return self.build_check_result( + details=details, + level=overall_level + ) \ No newline at end of file diff --git a/ops/healthcheck/checks/mem/utilization/meminfo_usages_estimation_check.py b/ops/healthcheck/checks/mem/utilization/meminfo_usages_estimation_check.py new file mode 100644 index 0000000000000000000000000000000000000000..4301d049190df9f4e43e1308aabc75345a1d4ec3 --- /dev/null +++ b/ops/healthcheck/checks/mem/utilization/meminfo_usages_estimation_check.py @@ -0,0 +1,240 @@ +#checks/mem/utilization/meminfo_usages_estimation_check.py +import re +import logging +from typing import Any +from core.datacache import DataCache +from core.base_plugin import BasePlugin, MetricLevelMapper, ComparisonOperator +from utilities import file_content_utils, runcmd +from core.base_plugin import BasePlugin + +import logging + +logger = 
logging.getLogger("TOS_Health_Check") + + class MeminfoUsageEstimationCheck(BasePlugin): + """Estimate and display detailed memory usage based on /proc/meminfo. + + This plugin parses /proc/meminfo and calculates estimated memory usage + breakdowns, including user space, page cache, kernel memory, and shared + memory. The calculation formulas are as follows: + + - Σtasks_Pss: Mapped + AnonPages + - Page Cache: Active(file) + Inactive(file) + Shmem + SwapCached + - Kernel used memory: SReclaimable + SUnreclaim + VmallocUsed + + PageTables + KernelStack + HardwareCorrupted + Bounce + Percpu + - Kernel direct alloc_pages() used memory: + MemTotal - MemFree - Kernel memory - + (Active + Inactive + Unevictable + (HugePages_Total * Hugepagesize)) + - Shmem: + - system_v_shared: + - posix_shared: + - tmpfs: df -BM -ha -t tmpfs + - devtmpfs: df -BM -ha -t devtmpfs + - mmap_anonymous_shared: Shmem - System_V - POSIX_shared - tmpfs - devtmpfs + + This check always returns a level of at least INFO; it never returns OK. + """ + + name: str = "Meminfo Usage Estimation Check" + version: str = "1.0.0" + description: str = ( + f"解析 /proc/meminfo 并估算内存类型的使用情况,包括内核内存、用户态内存、文件页缓存、共享内存," + f"提供全面的内存使用类型消耗视图。如果内核内存 > MemTotal*10% 则报告告警。\n" + f"Parses /proc/meminfo and estimates detailed memory usage breakdown, " + f"including user space, page cache, kernel memory, and shared memory. " + f"Provides a comprehensive view of memory consumption using formulas " + f"based on various meminfo fields." + ) + subsystem: str = "mem" + category: str = "utilization" + solution: str = ( + f"请查看 'details' 字段中的详细内存使用情况。如果内存压力较大,建议优化应用程序内存使用、" + f"调整页缓存,或排查内核内存消耗。对于共享内存,请检查 tmpfs、devtmpfs 及其他共享内存段。\n" + f"Review the detailed memory usage breakdown in the 'details' field. " + f"If memory pressure is high, consider tuning application memory usage, " + f"optimizing page cache, or investigating kernel memory consumers. " + f"For shared memory, check tmpfs, devtmpfs, and other shared memory segments."
+ ) + + metric_level_mapper = MetricLevelMapper( + warning=0.10, + error=0.15, + critical=0.20, + comparison_operator=ComparisonOperator.GE + ) + + default_level = "INFO" + + async def push_result(self) -> dict[str, Any] | None: + proc_meminfo: DataCache | None = self.data_caches_dict.get( + "mem_proc_meminfo") if self.data_caches_dict else None + + if not proc_meminfo: + return self.build_check_result( + details="No mem_proc_meminfo data available.", + level="ERROR" + ) + + proc_meminfo_content: str = proc_meminfo.get_content() + meminfo: dict[str, int] = file_content_utils.parse_common_key_value_content( + proc_meminfo_content + ) + + KB_IN_GB = 1024 * 1024 + + def kb2gb(val: int) -> float: + return round(val / KB_IN_GB, 2) + + # Helper function to get value or 0 if missing + def get_value(key: str) -> int: + return meminfo.get(key, 0) + + # Σtasks_Pss: Mapped + AnonPages + mapped = get_value("Mapped") + anonpages = get_value("AnonPages") + tasks_pss = mapped + anonpages + + # Page Cache: Active(file) + Inactive(file) + Shmem + SwapCached + active_file = get_value("Active(file)") + inactive_file = get_value("Inactive(file)") + shmem = get_value("Shmem") + swapcached = get_value("SwapCached") + page_cache = active_file + inactive_file + shmem + swapcached + + # Explicit kernel used memory + sreclaimable = get_value("SReclaimable") + sunreclaim = get_value("SUnreclaim") + vmallocused = get_value("VmallocUsed") + pagetables = get_value("PageTables") + kernelstack = get_value("KernelStack") + hardwarecorrupted = get_value("HardwareCorrupted") + bounce = get_value("Bounce") + percpu = get_value("Percpu") + explicit_kernel_used = ( + sreclaimable + sunreclaim + vmallocused + pagetables + + kernelstack + hardwarecorrupted + bounce + percpu + ) + + # Kernel direct alloc_pages() used memory + memtotal = get_value("MemTotal") + memfree = get_value("MemFree") + active = get_value("Active") + inactive = get_value("Inactive") + unevictable = get_value("Unevictable") + hugepages_total = get_value("HugePages_Total") + hugepages_size = get_value("Hugepagesize") + implicit_kernel_alloc_pages = ( + memtotal - memfree - explicit_kernel_used - + (active + inactive + unevictable + (hugepages_total * hugepages_size)) + ) + + kernel_used_total = explicit_kernel_used + implicit_kernel_alloc_pages + if memtotal: + kernel_used_ratio = kernel_used_total / memtotal + else: + kernel_used_ratio = 0 + + # ----------- Shmem in details----------- + # 1. System V shared memory + system_v_shared_kb = 0 + stdout, _, _ = runcmd.execute_command("ipcs -m") + for line in stdout.splitlines(): + if re.match(r"^0x", line): + parts = line.split() + if len(parts) >= 5: + try: + system_v_shared_kb += int(parts[4]) // 1024 + except Exception as e: + logger.error(f"parse error: {e}, line: {line}") + + # 2. POSIX shared memory + posix_shared_kb = 0 + stdout, _, _ = runcmd.execute_command("df -t tmpfs /dev/shm") + for line in stdout.splitlines(): + if "/dev/shm" in line: + parts = line.split() + if len(parts) >= 3: + try: + posix_shared_kb = int(parts[2]) # 'Used' column + except Exception as e: + logger.error(f"parse error: {e}, line: {line}") + + # 3. tmpfs + tmpfs_kb = 0 + stdout, _, _ = runcmd.execute_command("df -t tmpfs | grep -v '/dev/shm'") # exclude /dev/shm, already counted as POSIX shared memory above + + for line in stdout.splitlines(): + if line.startswith("Filesystem"): + continue + parts = line.split() + if len(parts) >= 3: + try: + tmpfs_kb += int(parts[2]) # 'Used' column + except Exception as e: + logger.error(f"parse error: {e}, line: {line}") + + + # 4. devtmpfs
+ devtmpfs_kb = 0 + stdout, _, _ = runcmd.execute_command("df -t devtmpfs") + for line in stdout.splitlines(): + if line.startswith("Filesystem"): + continue + parts = line.split() + if len(parts) >= 3: + try: + devtmpfs_kb += int(parts[2]) # 'Used' column + except Exception as e: + logger.error(f"parse error: {e}, line: {line}") + # 5. mmap shared anonymous memory + mmap_anonymous_shared_kb = shmem - system_v_shared_kb - posix_shared_kb - tmpfs_kb - devtmpfs_kb + + details = ( + f"Kernel used memory: total {kb2gb(kernel_used_total)} GB, ratio {kernel_used_ratio:.2%}\n" + + f"Page Cache: {kb2gb(page_cache)} GB\n" + f" - Active(file) {kb2gb(active_file)} GB\n" + f" - +Inactive(file) {kb2gb(inactive_file)} GB\n" + f" - +Shmem {kb2gb(shmem)} GB\n" + f" - +SwapCached {kb2gb(swapcached)} GB\n" + + f"Explicit kernel used memory: {kb2gb(explicit_kernel_used)} GB\n" + f" - SReclaimable {kb2gb(sreclaimable)} GB\n" + f" - +SUnreclaim {kb2gb(sunreclaim)} GB\n" + f" - +VmallocUsed {kb2gb(vmallocused)} GB\n" + f" - +PageTables {kb2gb(pagetables)} GB\n" + f" - +KernelStack {kb2gb(kernelstack)} GB\n" + f" - +HardwareCorrupted {kb2gb(hardwarecorrupted)} GB\n" + f" - +Bounce {kb2gb(bounce)} GB\n" + f" - +Percpu {kb2gb(percpu)} GB\n" + + f"Implicit kernel used memory, e.g. alloc_pages(): {kb2gb(implicit_kernel_alloc_pages)} GB\n" + f" MemTotal {kb2gb(memtotal)} GB\n" + f" - -MemFree {kb2gb(memfree)} GB\n" + f" - -Explicit Kernel used {kb2gb(explicit_kernel_used)} GB\n" + f" - -User space application used: {kb2gb(active + inactive + unevictable + hugepages_total * hugepages_size)} GB\n" + f" - Active {kb2gb(active)} GB\n" + f" - +Inactive {kb2gb(inactive)} GB\n" + f" - +Unevictable {kb2gb(unevictable)} GB\n" + f" - +(HugePages_Total {hugepages_total} * Hugepagesize {kb2gb(hugepages_size)}) GB\n" + + f"Σtasks_Pss: {kb2gb(tasks_pss)} GB\n" + f" - Mapped {kb2gb(mapped)} GB\n" + f" - +AnonPages {kb2gb(anonpages)} GB\n" + + f"Shmem in /proc/meminfo: {kb2gb(shmem)} GB\n" + f" - system_v_shared: {kb2gb(system_v_shared_kb)} GB\n" + f" - +posix_shared: {kb2gb(posix_shared_kb)} GB\n" + f" - +tmpfs: {kb2gb(tmpfs_kb)} GB\n" + f" - +devtmpfs: {kb2gb(devtmpfs_kb)} GB\n" + f" - +mmap_anonymous_shared: {kb2gb(mmap_anonymous_shared_kb)} GB" + ) + + mapped_level = self.metric_level_mapper.map(kernel_used_ratio) + if mapped_level is None: + return self.handle_metric_not_matched(kernel_used_ratio, details) + else: + return self.build_check_result( + details=details, + metric_value=kernel_used_ratio + ) \ No newline at end of file diff --git a/ops/healthcheck/checks/mem/utilization/total_mem_usage_check.py b/ops/healthcheck/checks/mem/utilization/total_mem_usage_check.py new file mode 100644 index 0000000000000000000000000000000000000000..f1f66e8197254508d23ad53587df76b88110ef60 --- /dev/null +++ b/ops/healthcheck/checks/mem/utilization/total_mem_usage_check.py @@ -0,0 +1,55 @@ +#checks/mem/utilization/total_mem_usage_check.py +from core.datacache import DataCache +from core.base_plugin import BasePlugin, MetricLevelMapper, ComparisonOperator +from utilities import file_content_utils + +class CheckMemoryUsage(BasePlugin): + name: str = "Total Memory Usage Check" + version: str = "2.1.0" + description: str = "Checks the total memory usage percent of the system." + solution: str = "Check the 'Details' for further information."
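+    # With ComparisonOperator.GE in the metric_level_mapper below, a usage of
+    # >= 97% maps to INFO, >= 98% to WARNING and >= 99% to ERROR; anything lower
+    # falls through to handle_metric_not_matched(), which reports the inherited
+    # default_level ("OK").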
+ subsystem: str = "mem" + category: str = "utilization" + + metric_level_mapper = MetricLevelMapper( + info=97, warning=98, error=99, + comparison_operator=ComparisonOperator.GE + ) + + async def push_result(self) -> dict | None: + + proc_meminfo: DataCache | None = self.data_caches_dict.get( + "mem_proc_meminfo") if self.data_caches_dict else None + + if not proc_meminfo: + return self.build_check_result( + details="No mem_proc_meminfo data available.", + level="ERROR" + ) + + proc_meminfo_content = proc_meminfo.get_content() + result = file_content_utils.parse_common_key_value_content( + proc_meminfo_content + ) + mem_usage_percent = round( + ( + result["MemTotal"] - result["MemFree"] + ) / result["MemTotal"] * 100, 2 + ) + + details = ( + f"Memory usage percent: {mem_usage_percent}%, " + + f"MemTotal:{result['MemTotal']} kB, MemFree:{result['MemFree']} kB" + ) + + mapped_level = self.metric_level_mapper.map(mem_usage_percent) + if mapped_level is None: + return self.handle_metric_not_matched( + mem_usage_percent, + details=details + ) + + return self.build_check_result( + details=details, + metric_value=mem_usage_percent + ) diff --git a/ops/healthcheck/checks/misc/others/get_base_services_usage.sh b/ops/healthcheck/checks/misc/others/get_base_services_usage.sh new file mode 100755 index 0000000000000000000000000000000000000000..765d29682a21b7fdb91729195df6a0d9aecd27a7 --- /dev/null +++ b/ops/healthcheck/checks/misc/others/get_base_services_usage.sh @@ -0,0 +1,137 @@ +#!/bin/bash +#Get system base services CPU and Mem usages +#Author: jingqunli@tencent.com +#Release date: 202503 + +set -eu +set -o pipefail + +export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin" +export LANG=C +export LC_ALL=C + +readonly PROCESSES=( + "crond" + "rsyslogd" + "udevd" + "chronyd" + "ntpd" + "logind" + "journald" + "networkd" + "systemd" +) +readonly PIDSTAT_INTERVAL=10 +readonly PIDSTAT_COUNT=1 +readonly LOGFILE="/tmp/$(basename $0 .sh).log" + +# Redirect all output to the log file +exec > >(tee -a "$LOGFILE") 2>&1 + +# Function to log info +log_info() { + local cur_time=$(date +'%Y-%m-%dT%H:%M:%S%z') + echo "$cur_time INFO: $@" +} + +# Function to log error +log_err() { + local cur_time=$(date +'%Y-%m-%dT%H:%M:%S%z') + echo "$cur_time ERROR: $@" >&2 +} + +#### flock BEGIN #### +readonly LOCKFILE="/run/lock/$(basename $0 2>/dev/null)" +readonly LOCKFD=99 +# PRIVATE +_lock() { flock -$1 ${LOCKFD}; } +_no_more_locking() { _lock u; _lock xn && rm -f ${LOCKFILE}; } +_prepare_locking() { eval "exec ${LOCKFD}>\"${LOCKFILE}\""; trap _no_more_locking SIGINT SIGTERM SIGKILL EXIT; } +## PUBLIC +exlock_now() { _lock xn; } # obtain an exclusive lock immediately or fail +exlock() { _lock x; } # obtain an exclusive lock +shlock() { _lock s; } # obtain a shared lock +unlock() { _lock u; } # drop a lock +#### flock END #### + +function check_if_exclusive_exec_or_exit() { + _prepare_locking + exlock_now || { log_err "One instance of $0 is running, exit" && exit 1; } +} + +function get_pids() { + local regex + local pids + + regex=$(IFS="|"; echo "${PROCESSES[*]}") + pids=$(pgrep -d, -f "$regex" || true) + + if [[ -z "$pids" ]]; then + log_err "No specified processes found." 
+ exit 1 + fi + echo "$pids" +} + +function collect_cpu_usage() { + local pids="$1" + log_info "Collecting CPU usage for PIDs: $pids" + + pidstat -u -p "$pids" "$PIDSTAT_INTERVAL" "$PIDSTAT_COUNT" | grep "Average" | \ + awk ' + /CPU.*Command$/ { + # 找到标题行,遍历所有字段,找出%CPU所在列号 + for(i=1; i<=NF; i++) { + if ($i == "%CPU") cpu_col = i + } + next + } + { + # 打印最后一列(命令名)和cpu_col列的值 + print $NF, $(cpu_col) + } + ' | while read -r command cpu_usage; do + echo "Command: $command, %CPU: $cpu_usage"; + done +} + +function collect_mem_usage() { + local pids="$1" + log_info "Collecting memory usage for PIDs: $pids" + + pidstat -r -p "$pids" "$PIDSTAT_INTERVAL" "$PIDSTAT_COUNT" | grep "Average" | \ + awk ' + /RSS.*Command$/ { + # 找到标题行,遍历所有字段,找出RSS所在列号 + for(i=1; i<=NF; i++) { + if ($i == "RSS") rss_col = i + } + next + } + { + # 打印最后一列(命令名)和rss_col列的值 + print $NF, $(rss_col) + } + ' | while read -r command rss; do + echo "Command: $command, MEM RSS: $rss KB"; + done +} + +function main() { + local pids + + pids=$(get_pids) + if [[ -z "$pids" ]]; then + log_err "No PIDs found." + exit 1 + fi + collect_cpu_usage "$pids" + collect_mem_usage "$pids" +} + +trap 'log_err "An unexpected error occurred"' ERR +check_if_exclusive_exec_or_exit + +# Clear the log file at the start of each run +> "$LOGFILE" +main diff --git a/ops/healthcheck/checks/misc/others/system_base_servces_usages.py b/ops/healthcheck/checks/misc/others/system_base_servces_usages.py new file mode 100644 index 0000000000000000000000000000000000000000..108e2f0668a67ffcbbda6319924de93122bcbb87 --- /dev/null +++ b/ops/healthcheck/checks/misc/others/system_base_servces_usages.py @@ -0,0 +1,116 @@ +import os +import re +from core.base_plugin import BasePlugin, MetricLevelMapper, ComparisonOperator +from utilities import runcmd + +class BaseServiceCheck(BasePlugin): + name = "System Base Service Resource Check" + version = "1.0.0" + description = ( + f"使用 get_base_services_usage.sh 检查操作系统基础服务的 CPU 和内存情况。" + f"如果有进程 CPU 超过 4% 或内存 RSS 超过 100MB,则报告警告。\n" + f"Checks the CPU and memory usage of base system services using check_base_service.sh. " + f"Reports warning if any process CPU > 4% or RSS > 100MB." + ) + subsystem = "misc" + category = "others" + solution = ( + f"如果某个操作系统基础服务进程的 CPU 或内存使用率过高," + f"请检查该服务或者进程的状态(systemctl status XXXXX或者journalctl -u XXXXX等)," + f"并考虑重启或优化该服务。如有需要,请联系系统管理员。\n" + f"If a system base service process uses excessive CPU or memory, " + f"please check the process status, logs, and consider restarting or tuning the service. " + f"Consult your system administrator if necessary." 
+ ) + + metric_level_mapper = MetricLevelMapper( + info=0, + warning=1, + error=2, + critical=10, + comparison_operator=ComparisonOperator.GE + ) + default_level = "OK" + + SCRIPT_NAME: str = "get_base_services_usage.sh" + CPU_THRESHOLD: float = 4.0 + RSS_THRESHOLD_KB: int = 102400 # 100MB + + def _get_script_path(self) -> str: + return os.path.join(os.path.dirname(os.path.abspath(__file__)), self.SCRIPT_NAME) + + def _parse_metrics(self, output: str) -> list[dict]: + """ + 解析脚本输出,返回每个进程的资源使用情况 + """ + results = [] + for line in output.splitlines(): + # 解析CPU + m_cpu = re.match(r'Command:\s*(\S+),\s*%CPU:\s*([0-9.]+)', line) + if m_cpu: + results.append({ + "type": "cpu", + "command": m_cpu.group(1), + "value": float(m_cpu.group(2)) + }) + continue + # 解析RSS + m_rss = re.match(r'Command:\s*(\S+),\s*MEM RSS:\s*([0-9.]+)\s*KB', line) + if m_rss: + results.append({ + "type": "rss", + "command": m_rss.group(1), + "value": float(m_rss.group(2)) + }) + return results + + def push_result(self) -> dict | None: + details = "" + script_path = self._get_script_path() + + stdout, stderr, rc = runcmd.execute_command(script_path, timeout=150) + + script_output = "" + if stdout: + script_output += f"\n- stdout:\n{stdout.strip()}" + if stderr: + script_output += f"\n- stderr:\n{stderr.strip()}" + + if rc != 0: + details = f"{self.SCRIPT_NAME} execute failed." + details += script_output + return self.build_check_result( + details=details, + level="ERROR" + ) + + metrics = self._parse_metrics(stdout) + if not metrics: + details += f"Failed to parse metrics from {self.SCRIPT_NAME} output." + details += script_output + return self.build_check_result( + details=details, + level="ERROR" + ) + + warning_items = [] + for item in metrics: + if item["type"] == "cpu" and item["value"] > self.CPU_THRESHOLD: + warning_items.append(f"{item['command']} CPU {item['value']}%") + if item["type"] == "rss" and item["value"] > self.RSS_THRESHOLD_KB: + warning_items.append(f"{item['command']} RSS {item['value']/1024:.1f}MB") + + if warning_items: + details += "High resource usage detected: " + "; ".join(warning_items) + details += script_output + return self.build_check_result( + details=details, + level="WARNING" + ) + else: + details += "All base service processes are within resource limits." + details += script_output + return self.build_check_result( + details=details, + level="OK" + ) diff --git a/ops/healthcheck/checks/misc/others/system_time_offset_check.py b/ops/healthcheck/checks/misc/others/system_time_offset_check.py new file mode 100644 index 0000000000000000000000000000000000000000..8a74883384d7ff3a8738f793c13bd49e267f5dba --- /dev/null +++ b/ops/healthcheck/checks/misc/others/system_time_offset_check.py @@ -0,0 +1,98 @@ +#checks/misc/others/system_time_offset_check.py +import os +import re +from typing import Any +from core.base_plugin import BasePlugin, MetricLevelMapper, ComparisonOperator +from utilities import runcmd + +class TimeOffsetCheck(BasePlugin): + name = "System Time Offset Check" + version = "1.0.0" + description = ( + f"使用 check_time_offset.sh 检查系统时间与 NTP 服务器的偏移量。" + f"如果时间偏移超过推荐阈值,则进行告警。\n" + f"Checks the system time offset against NTP servers using check_time_offset.sh. " + f"Reports if the time offset exceeds recommended thresholds." 
+ ) + subsystem = "misc" + category = "others" + solution = ( + f"如果时间偏移过大,请检查 NTP/CHRONY 配置、ntp网络连通性,并确保系统能够访问可靠的 NTP 服务器。" + f"重启时间同步服务(如 ntpd 或 chronyd)一般会让系统时间以跳变方式恢复正常,但这可能引发应用异常。" + f"在重启时间同步服务前请评估影响,并在必要时联系系统管理员。\n" + f"If the time offset is too large, please check NTP/CHRONY configuration, " + f"network connectivity, and ensure the system can reach reliable NTP servers. " + f"Restarting time sync services (such as ntpd or chronyd) may cause " + f"the service step the system time and recover, but this could lead " + f"to business/application anomalies. Please evaluate the impact before " + f"restarting time synchronization services, and consult your system " + f"administrator if necessary." + ) + + metric_level_mapper = MetricLevelMapper( + info=0.125, + warning=1, + error=10, + critical=60, + comparison_operator=ComparisonOperator.GE + ) + default_level = "OK" + + SCRIPT_NAME :str = "get_system_time_offset.sh" + + def _get_script_path(self) -> str: + return os.path.join(os.path.dirname(os.path.abspath(__file__)), self.SCRIPT_NAME) + + def _parse_offset(self, output: str) -> float | None: + """ + 从脚本输出中提取 offset 数值(单位:秒,正数) + """ + # Time Offset xxxs + m = re.search(r'Time offset\s*([0-9.]+)s', output) + if m: + try: + return float(m.group(1)) + except Exception: + return None + + return None + + def push_result(self) -> dict | None: + details = "" + script_path = self._get_script_path() + + stdout, stderr, rc = runcmd.execute_command(script_path, timeout=150) + + script_output = "" + if stdout: + script_output += f"\n- stdout:\n{stdout.strip()}" + if stderr: + script_output += f"\n- stderr:\n{stderr.strip()}" + + if rc != 0: + details = f"{self.SCRIPT_NAME} execute failed." + details += script_output + return self.build_check_result( + details=details, + level="ERROR" + ) + + offset = self._parse_offset(stdout) + if offset is None: + details += f"[ERROR] Failed to parse time offset from {self.SCRIPT_NAME} output." 
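+            # _parse_offset() only recognizes a line of the form "Time offset <value>s"
+            # (the assumed output format of get_system_time_offset.sh); if the script
+            # output lacks it, report ERROR and attach the raw output below.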
+ details += script_output + return self.build_check_result( + details=details, + level="ERROR" + ) + + details += f"[Parsed time offset] {offset} seconds" + + mapped_level = self.metric_level_mapper.map(offset) + if mapped_level is None: + return self.handle_metric_not_matched(offset, details) + else: + return self.build_check_result( + details=details, + metric_value=offset + ) diff --git a/ops/healthcheck/config.yaml b/ops/healthcheck/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22c604a6418a84d4d9d4002d2bdcb1a7b4a764e2 --- /dev/null +++ b/ops/healthcheck/config.yaml @@ -0,0 +1,25 @@ +# config.yaml +TOS_Health_Check: + log_level: INFO + log_format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + log_file_directory: "/var/log/t-ops/" + log_console: True + developer_mode: False + +total_mem_usage_check: + enabled: True + info_throttle: + memory_usage_percent: 82% + free_usage_amount: 4 + warning_throttle: + memory_usage_percent: 82% + free_usage_amount: 4GB + error_throttle: + memory_usage_percent: 80% # 内存使用率阈值 + memory_usage_amount: 4GB # 内存使用量阈值 + +zone_memory_pressure_check: + enabled: True +meminfo_usages_estimation_check: + enabled: True + diff --git a/ops/healthcheck/core/base_plugin.py b/ops/healthcheck/core/base_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..09cd0a5b003b53b023928547c0b2f4e044b6c44e --- /dev/null +++ b/ops/healthcheck/core/base_plugin.py @@ -0,0 +1,326 @@ +# core/base_plugin.py +from dataclasses import dataclass +from enum import Enum, auto +from typing import Optional, List, Dict, Callable, Union, Literal, ClassVar +import operator +import asyncio +from abc import ABC, abstractmethod +from core.check_item import CheckItem +from core.logger import logging +from core.datacache import DataCache + +logger = logging.getLogger("TOS_Health_Check") + +# -------------------------- +# 1. 枚举类型定义Level +# -------------------------- +class LevelEnum(Enum): + OK = auto() + INFO = auto() + WARNING = auto() + CRITICAL = auto() + ERROR = auto() + + @classmethod + def from_str(cls, s: str): + """支持字符串转LevelEnum""" + try: + return cls[s.upper()] + except (KeyError, AttributeError): + raise ValueError(f"Invalid level string: {s}") + + def __str__(self): + return self.name + +class ComparisonOperator(Enum): + GT = 'GT' + GE = 'GE' + LT = 'LT' + LE = 'LE' + EQ = 'EQ' + NE = 'NE' + + def get_operator(self) -> Callable[[float, float], bool]: + + operators = { + ComparisonOperator.GT: operator.gt, + ComparisonOperator.GE: operator.ge, + ComparisonOperator.LT: operator.lt, + ComparisonOperator.LE: operator.le, + ComparisonOperator.EQ: operator.eq, + ComparisonOperator.NE: operator.ne, + } + return operators[self] + +# -------------------------- +# 2. 根据threshold映射metric至sevetiry level +# -------------------------- +@dataclass +class MetricLevelMapper: + ok: Optional[float] = None + info: Optional[float] = None + warning: Optional[float] = None + critical: Optional[float] = None + error: Optional[float] = None + comparison_operator: ComparisonOperator = ComparisonOperator.GE + + def __post_init__(self): + self._validate() + + def _validate(self): + thresholds = [ + t for t in [ + self.ok, + self.info, + self.warning, + self.error, + self.critical + ] if t is not None + ] + + if self.comparison_operator in (ComparisonOperator.GE, + ComparisonOperator.GT): + required_order = sorted(thresholds) + if thresholds != required_order: + raise ValueError( + f"Invalid threshold order. 
For GE/GT, severity thresholds " + f"must be in ASCENDING order. \n" + f"Stricter conditions require LARGER thresholds " + f"(ok < info < warning < error < critical)." + ) + else: + required_order = sorted(thresholds, reverse=True) + if thresholds != required_order: + raise ValueError( + f"Invalid threshold order. For LE/LT, severity thresholds " + f"must be in DESCENDING order. \n" + f"Stricter conditions require SMALLER thresholds " + f"(ok > info > warning > error > critical).") + + def _get_levels_order(self) -> List[LevelEnum]: + if self.comparison_operator in (ComparisonOperator.GE, + ComparisonOperator.GT): + return [ + LevelEnum.CRITICAL, + LevelEnum.ERROR, + LevelEnum.WARNING, + LevelEnum.INFO, + LevelEnum.OK + ] + else: + return [ + LevelEnum.OK, + LevelEnum.INFO, + LevelEnum.WARNING, + LevelEnum.ERROR, + LevelEnum.CRITICAL + ] + + + def map(self, metric_value: float) -> Optional[LevelEnum]: + """ + 根据给定的指标值(metric_value),结合预设的阈值和比较运算符,映射出对应的等级(LevelEnum)。 + + 该方法会按照等级优先级顺序,依次比较每个等级的阈值(如 self.info, self.warning, self.critical, self.error等)。 + 对于每个等级,如果阈值不为 None,且 metric_value 与该阈值通过指定的比较运算符(如大于、小于等)比较为真, + 则返回该等级(LevelEnum)。如果所有等级都不匹配,则返回 None。 + + 参数: + metric_value (float): 需要进行等级映射的指标值。 + + 返回: + Optional[LevelEnum]: 匹配到的等级枚举值。如果没有任何等级匹配,则返回 None。 + + 行为说明: + - 比较运算符由 self.comparison_operator.get_operator() 提供,通常为大于、小于等函数。 + - 等级的顺序由 self._get_levels_order() 决定,必须从高到低或从低到高。 + - 每个等级的阈值通过 getattr(self, level.name.lower()) 获取,需保证类中有对应的属性。 + - 只要找到第一个匹配的等级即返回,不会继续检查后续等级。 + + 示例: + 假设有如下等级和阈值: + self.info = 10 + self.warning = 20 + self.critical = 30 + 并且比较运算符为大于等于(>=),等级顺序为 [critical, warning, info]。 + + - 若 metric_value = 25,则返回 warning + - 若 metric_value = 35,则返回 critical + - 若 metric_value = 5,则返回 None + + """ + operator = self.comparison_operator.get_operator() + for level in self._get_levels_order(): + threshold = getattr(self, level.name.lower()) + if threshold is not None and operator(metric_value, threshold): + return level + return None + +# -------------------------- +# 3. 继承CheckItem实现简单的通用插件基类 +# -------------------------- +class BasePlugin(CheckItem, ABC): + """健康检查插件基类 + BasePlugin 继承 CheckItem,拥有 CheckItem 的全部属性和方法。继承该类可以让插件管理模块识别为插件。 + CheckItem 只约束了插件的最基本属性和方法,而 BasePlugin 有用于构建检查结果的辅助方法等。 + 绝大部分插件应该优先直接继承 BasePlugin ,而非优化直接继承 CheckItem 。 + push_result() 是插件执行引擎的执行入口,子类只有重写 push_result() 才能返回检查结果。 + 重要约定:禁止修改该类的类属性。一旦改了,绝大部分子类的实例都会跟着变,容易出一些奇怪的 bug。 + """ + _REQUIRED_FIELDS: ClassVar[tuple[str, ...]] = ( + "name", "version", "description", + "subsystem", "category", "details", + "level", "solution" + ) + + # 子类可覆盖默认值 + name: str = "" + version: str = "" + description: str = "" + subsystem: str = "" + category: str = "" + details: str = "" + level: Union[str, LevelEnum] = "" + solution: str = "" + + default_level: str = "OK" + """ + 当指标值未命中任何阈值(即 metric_level_mapper.map(metric_value) 返回 None)时, + 作为检查结果的默认等级(level)返回。该属性可被子类覆盖以适配不同业务场景。 + 典型用法:在 handle_metric_not_matched 方法中引用,表示“正常”或“无风险”状态。 + """ + + metric_level_mapper: Optional[MetricLevelMapper] = None + + def __init__(self, + data_caches_dict: Optional[Dict[str, DataCache]]= None, **kwargs): + super().__init__(data_caches_dict=data_caches_dict) + # 扩展占位:允许子类通过kwargs传递其他参数 + for k, v in kwargs.items(): + setattr(self, k, v) + + def handle_metric_not_matched(self, metric_value: float, details: str) -> dict: + """ + Handle the case where the given metric value does not match (trigger) any defined threshold. 
+ + This method is called when the metric value does not reach any of the levels (e.g., warning, critical) + defined in the metric_level_mapper. By default, it considers the result as normal and assigns the + default_level (usually "INFO") to the check result. The details field will include a clear note + indicating that no threshold was triggered. + + Subclasses can override this method to customize the behavior for unmatched metric values. + + Args: + metric_value (float): The metric value that was evaluated but did not match any threshold. + details (str): Additional information or context about the check. + + Returns: + dict: The check result dictionary, with details and the default level. + """ + return self.build_check_result( + details=( + f"{details}\n" + f"[NOTE] The current metric value {metric_value} did not trigger any threshold. " + f"Result is considered normal (level={self.default_level})." + ), + level=self.default_level + ) + + def build_check_result(self, + name: Optional[str] = None, + version: Optional[str] = None, + description: Optional[str] = None, + subsystem: Optional[str] = None, + category: Optional[str] = None, + details: Optional[str] = None, + level: Optional[Union[str, LevelEnum]] = None, + solution: Optional[str] = None, + metric_value: Optional[float] = None + ) -> Dict[str, str]: + """ + 构建检查结果 + 遵循以下规则: + 1. 参数优先级:传入参数 > 类属性 + 2. level 与 metric_value互斥 + 3. metric_value 需配合 metric_level_mapper 使用 + 4. 如果metric_level_mapper.map(metric_value)返回None会引起报错 + 5. 自动处理 LevelEnum 转换 + """ + if level is not None and metric_value is not None: + raise ValueError( + f"build_check_result(): 'metric_value' and 'level' are " + f"mutually exclusive parameters" + ) + + result_dict = { + "name": name if name is not None else self.name, + "version": version if version is not None else self.version, + "description": description if description is not None else self.description, + "subsystem": subsystem if subsystem is not None else self.subsystem, + "category": category if category is not None else self.category, + "details": details if details is not None else self.details, + "level": level if level is not None else self.level, + "solution": solution if solution is not None else self.solution + } + + if metric_value is not None: + # 传入 metric_value 参数时,本类属性中必须显示定义 metric_level_mapper,此处严格限制了不查找父类 + metric_level_mapper: MetricLevelMapper = type(self).__dict__.get('metric_level_mapper', None) + # 兼容level为字符串或LevelEnum + if isinstance(result_dict["level"], LevelEnum): + result_dict["level"] = result_dict["level"].name + elif isinstance(result_dict["level"], str): + try: + result_dict["level"] = LevelEnum.from_str(result_dict["level"]).name + except Exception: + pass # 保持原字符串,后续有校验 + + if not isinstance(metric_level_mapper, MetricLevelMapper): + raise ValueError( + f"build_check_result(): When processing metric_value, " + f"'metric_level_mapper' must be defined as a class attribute " + f"in the current class ({type(self).__name__}), " + f"and must be a valid instance of MetricLevelMapper. " + f"Currently, 'metric_level_mapper' is: " + f"{repr(metric_level_mapper)} " + f"(type: {type(metric_level_mapper).__name__}). " + f"Please ensure that 'metric_level_mapper' is explicitly " + f"set in the class body of '{type(self).__name__}', " + f"not inherited from a parent class or missing." 
+ ) + + mapped_level: LevelEnum = metric_level_mapper.map(metric_value) + if mapped_level is None: + thresholds_info = [] + for level in metric_level_mapper._get_levels_order(): + threshold = getattr(metric_level_mapper, level.name.lower()) + thresholds_info.append(f"{level.name}={threshold}") + thresholds_str = ", ".join(thresholds_info) + raise ValueError(f"build_check_result(): mapped_level is None " + f"as metric_value={metric_value} did not match " + f"any threshold in metric_level_mapper. " + f"Thresholds: {thresholds_str}") + + result_dict["level"] = mapped_level.name + + if isinstance(result_dict["level"], LevelEnum): + result_dict["level"] = result_dict["level"].name + + self._validate_result(result_dict) + return result_dict + + def _validate_result(self, result_dict: dict) -> None: + missing_or_empty = [] + for field in self._REQUIRED_FIELDS: + value = result_dict.get(field) + if value is None or (isinstance(value, str) and not value.strip()): + missing_or_empty.append(field) + if missing_or_empty: + raise ValueError(f"build_check_result(): Required fields are " + f"missing or empty: " + f"{', '.join(missing_or_empty)}") + + @abstractmethod + def push_result(self) -> Union[Dict, asyncio.Future, None]: + """子类必须实现的方法""" + pass diff --git a/ops/healthcheck/core/check_item.py b/ops/healthcheck/core/check_item.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbb0caff7082dcba90fb64872f7d72b5418dc6b --- /dev/null +++ b/ops/healthcheck/core/check_item.py @@ -0,0 +1,85 @@ +# core/check_item.py +from abc import ABC, ABCMeta, abstractmethod +import asyncio +from typing import Union, Dict, List, Optional, Any +import traceback +import logging +from core.datacache import DataCache + +logger = logging.getLogger("TOS_Health_Check") + +# 定义一个装饰器来标记方法为 final +def final(method): + method.__is_final__ = True + return method + +# 定义一个元类来检查 final 方法是否被覆盖 +class FinalMeta(ABCMeta): + def __new__(cls, name, bases, namespace): + # 遍历基类 + for base in bases: + for attr_name, attr_value in base.__dict__.items(): + # 检查基类中的方法是否被标记为 final + if getattr(attr_value, '__is_final__', False): + # 如果子类尝试覆盖 final 方法,则抛出异常 + if attr_name in namespace: + raise TypeError(f"Cannot override final method '{attr_name}' in class '{name}'") + return super().__new__(cls, name, bases, namespace) + +# 使用元类 FinalMeta 来创建 CheckItem 类 +class CheckItem(ABC, metaclass=FinalMeta): + """CheckItem 是“插件协议”,定义了插件的基础属性和方法,构成最简单的插件框架。继承该类是实现插件的最低要求。 + 只有仅需基础功能的插件才直接继承 CheckItem 类,因为它无需引入 BasePlugin 的额外逻辑。 + push_result()是插件执行引擎的执行入口,子类只有重写 push_result() 才能返回检查结果。 + 重要约定:禁止修改该类的类属性。一旦改了,所有子类的实例都会跟着变,容易出一些奇怪的 bug。 + """ + + # 子类属性中性重写该属性可声明本类依赖的插件列表,被依赖的插件会先提交运行. 
+ # 此处使用空元组进行初始化主要是防御一部分dependencies类属性被误修改的情况,子类重写时可赋值为列表等可迭代对象。 + dependencies: Optional[List[str]] = () + + def __init__(self, + data_caches_dict: Optional[Dict[str, DataCache]] = None): + # 主程序传递的全局存储数据,子类方法可通过 self.data_caches_dict 使用 + self.data_caches_dict = data_caches_dict + + def get_dependencies(self): + """获取dependencies类属性 + """ + return self.__class__.dependencies + + @abstractmethod + def push_result(self) -> Union[Dict, None, asyncio.Future]: + """子类需要的返回检查结果的方法 + + Returns: + 返回结果可以是字典、None 或 Future + """ + pass + + def initialize(self): + """子类可重写的钩子方法,以在检查操作运行前执行特定操作""" + pass + + def finalize(self): + """子类可重写的钩子方法,以在检查操作运行后执行特定操作""" + pass + + # 不允许子类重写 base_run_check + @final + async def base_run_check(self): + """负责运行插件中的push_result()方法,并确保能处理所有任务类型(同步或者异步)的结果""" + try: + result = self.push_result() + if asyncio.iscoroutine(result): + result = await result + elif isinstance(result, asyncio.Future): + result = await result + else: + # 同步操作放入线程池中执行 + loop = asyncio.get_running_loop() + result = await loop.run_in_executor(None, lambda: result) + return result + except Exception as e: + logger.error(f"{e}") + logger.debug(f"{traceback.format_exc()}") diff --git a/ops/healthcheck/core/config_manager.py b/ops/healthcheck/core/config_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..5c41d7a8c96954c543571f8d1ba7d6ba5c66c6c2 --- /dev/null +++ b/ops/healthcheck/core/config_manager.py @@ -0,0 +1,48 @@ +# core/config_manager.py +from ruamel.yaml import YAML +from typing import Any, Dict + +class ConfigManager: + def __init__(self, config_file: str = "config.yaml"): + """默认使用全局的配置文件 config.yaml 初始化实例""" + self.config_file = config_file + self.config = self._load_config() + + def _load_config(self) -> dict: + yaml = YAML() + with open(self.config_file, 'r') as file: + return yaml.load(file) + + def _save_config(self) -> None: + yaml = YAML() + with open(self.config_file, 'w') as file: + yaml.dump(self.config, file) + + def is_plugin_enabled(self, plugin_name: str) -> bool: + plugin_config = self.config.get(plugin_name, {}) + enabled = plugin_config.get('enabled', False) + if not isinstance(enabled, bool): + print( + f"\033[94mInvalid value for 'enabled' in plugin '{plugin_name}' " + f"in the file '{self.config_file}'. 
Defaulting to False.\033[0m" + ) + return False + return enabled + + def get_plugin_config(self, plugin_name: str) -> Dict[str, Any]: + """获取插件的完整配置""" + return self.config.get(plugin_name, {}) + + def get_plugin_param(self, plugin_name: str, + param_name: str, default: Any = None) -> Any: + """获取插件配置中的特定参数""" + plugin_config = self.config.get(plugin_name, {}) + return plugin_config.get(param_name, default) + + def set_plugin_param(self, plugin_name: str, + param_name: str, value: Any) -> None: + """设置插件配置中的特定参数""" + if plugin_name not in self.config: + self.config[plugin_name] = {} + self.config[plugin_name][param_name] = value + self._save_config() diff --git a/ops/healthcheck/core/constants.py b/ops/healthcheck/core/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..6a64752992c2c5d2ab07d7f66f55b2f86c99e099 --- /dev/null +++ b/ops/healthcheck/core/constants.py @@ -0,0 +1,15 @@ +#core/constants.py +FILE_PATHS = { + # 内存信息,以mem开头 + "mem_proc_meminfo": "/proc/meminfo", + "mem_proc_vmstat": "/proc/vmstat", + "mem_proc_zoneinfo": "/proc/zoneinfo", + "mem_proc_budyinfo": "/proc/buddyinfo", + "mem_proc_pagetypeinfo": "/proc/pagetypeinfo", + "mem_proc_slabinfo": "/proc/slabinfo", + "mem_sys_kernel_debug_extfrag_extfrag_index": "/sys/kernel/debug/extfrag/extfrag_index", + "mem_sys_kernel_debug_extfrag_unusable_index": "/sys/kernel/debug/extfrag/unusable_index" + # CPU信息文件,以cpu开头 + #"cpu_proc_cpuinfo": "/proc/cpuinfo", + # ... 其他路径 +} diff --git a/ops/healthcheck/core/datacache.py b/ops/healthcheck/core/datacache.py new file mode 100644 index 0000000000000000000000000000000000000000..88eeb8bfb78c73b2d898cca56f93ce72a7c708c7 --- /dev/null +++ b/ops/healthcheck/core/datacache.py @@ -0,0 +1,50 @@ +# core/datacache.py +from abc import ABC, abstractmethod +#import requests +import threading +import time + +class DataFetchStrategy(ABC): + @abstractmethod + def fetch(self): + pass + +class FileFetchStrategy(DataFetchStrategy): + def __init__(self, filepath): + self.filepath = filepath + + def fetch(self): + with open(self.filepath) as f: + return f.read() + +# TODO: 实现时需考虑支持异步 +#class HttpFetchStrategy(DataFetchStrategy): +# def __init__(self, url): +# self.url = url +# +# def fetch(self): +# resp = requests.get(self.url) +# resp.raise_for_status() +# return resp.text + +class DataCache: + def __init__(self, fetch_strategy_instance: DataFetchStrategy, ttl=5): + self._fetch_strategy_instance = fetch_strategy_instance + self._ttl = ttl + self._lock = threading.Lock() + self._data = None + self._last_fetch = 0 + + def get_content(self): + now = time.time() + with self._lock: + if self._data is None or now - self._last_fetch > self._ttl: + self._data = self._fetch_strategy_instance.fetch() + self._last_fetch = now + return self._data + + def refresh_content(self): + with self._lock: + self._data = self._fetch_strategy_instance.fetch() + self._last_fetch = time.time() + return self._data diff --git a/ops/healthcheck/core/logger.py b/ops/healthcheck/core/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..9875280b67fdf425a7e3c025895831954d1bcf16 --- /dev/null +++ b/ops/healthcheck/core/logger.py @@ -0,0 +1,99 @@ +# core/logger.py +import logging +import os +import sys +from core.config_manager import ConfigManager + +class ColoredFormatter(logging.Formatter): + ASNI_COLORS = { + 'DEBUG': '\033[94m', + 'WARNING': '\033[93m', + 'ERROR': '\033[91m', + 'CRITICAL': '\033[95m' + } + RESET_ASNI_COLOR = '\033[0m' + + def format(self, record): + color = 
self.ASNI_COLORS.get(record.levelname, self.RESET_ASNI_COLOR) + message = super().format(record) + return f"{color}{message}{self.RESET_ASNI_COLOR}" + +def configure_logger(config_manager: ConfigManager, logger_name: str): + logger = logging.getLogger(logger_name) + + if not logger.hasHandlers(): + log_config = config_manager.get_plugin_config(logger_name) + if len(log_config) == 0: + print(f"\033[94mLogger: '{logger_name}' cannot be found in the configuration file " + f"'{config_manager.config_file}'. Apply default settings for logger.\033[0m") + + level = log_config.get("log_level", "INFO").upper() + log_format = log_config.get("log_format", + f"%(asctime)s - %(name)s - " + f"%(levelname)s - %(message)s") + log_file_directory = log_config.get("log_file_directory", "/var/log/t-ops/") + log_file_name = f"{logger_name}.log" + console_enabled = log_config.get("log_console", True) + if not isinstance(console_enabled, bool): + print( + f"\033[94mLogger: Invalid value for 'console' in '{logger_name}' " + f"in the file '{config_manager.config_file}'. Defaulting to True.\033[0m" + ) + console_enabled = True + + developer_mode = log_config.get("developer_mode", False) + if not isinstance(developer_mode, bool): + print( + f"\033[94mLogger: Invalid value for 'developer_mode' in '{logger_name}' " + f"in the file '{config_manager.config_file}'. Defaulting to False.\033[0m" + ) + developer_mode = False + + logger.setLevel(getattr(logging, level, logging.INFO)) + + plain_formatter = logging.Formatter(log_format) + colored_formatter = ColoredFormatter(log_format) + + handlers_to_add = [] + file_logging_enabled = True + + if not isinstance(log_file_directory, str) or not log_file_directory.strip(): + print(f"\033[93mWarning: 'log_file_directory' is invalid or empty. " + f"File logging is disabled.\033[0m") + file_logging_enabled = False + else: + try: + os.makedirs(log_file_directory, exist_ok=True) + log_file_path = os.path.join(log_file_directory, log_file_name) + try: + file_handler = logging.FileHandler(log_file_path) + file_handler.setFormatter(plain_formatter) + handlers_to_add.append(file_handler) + except Exception as e: + print(f"\033[93mWarning: Failed to create log file " + f"'{log_file_path}': {e}. " + f"File logging is disabled.\033[0m") + file_logging_enabled = False + except Exception as e: + print(f"\033[93mWarning: Failed to create log directory " + f"'{log_file_directory}': {e}. 
" + f"File logging is disabled.\033[0m") + file_logging_enabled = False + + if not file_logging_enabled and not console_enabled: + print(f"\033[93mWarning: File logging is disabled, " + f"force enabling console logging for '{logger_name}'.\033[0m") + console_enabled = True + + if console_enabled: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(colored_formatter) + handlers_to_add.append(console_handler) + + for handler in handlers_to_add: + logger.addHandler(handler) + + if developer_mode: + logger.setLevel(logging.DEBUG) + + return logger diff --git a/ops/healthcheck/core/plugin_manager.py b/ops/healthcheck/core/plugin_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..4d22c3ac598621e083188c2cf3ca0dd174821686 --- /dev/null +++ b/ops/healthcheck/core/plugin_manager.py @@ -0,0 +1,163 @@ +# core/plugin_manager.py +import os +import importlib.util +import traceback +import logging +from typing import Dict, Type, List, Optional +from core.check_item import CheckItem +from core.base_plugin import BasePlugin +from core.config_manager import ConfigManager +from collections import defaultdict, deque +from core.datacache import DataCache + +logger = logging.getLogger("TOS_Health_Check") + +class CircularDependencyError(Exception): + pass + +class PluginManager: + def __init__(self, config_manager: ConfigManager): + self.plugins: Dict[str, Type[CheckItem]] = {} + self.plugin_instances: Dict[str, CheckItem] = {} + self.config_manager = config_manager + self.plugin_times: Dict[str, float] = {} + + def discover_plugins(self, base_dir: str): + for root, _, files in os.walk(base_dir): + for file in files: + if file.endswith(".py") and not file.startswith("__"): + module_name = os.path.splitext(file)[0] + module_path = os.path.join(root, file) + self.load_plugin(module_name, module_path) + + def load_plugin(self, module_name: str, module_path: str): + try: + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + for attr in dir(module): + cls = getattr(module, attr) + if isinstance(cls, type) and issubclass(cls, CheckItem): #and cls is not CheckItem: + if cls is not CheckItem and cls is not BasePlugin: + if module_name in self.plugins: + raise ValueError(f"Duplicate plugin: {module_name}") + self.plugins[module_name] = cls + logger.debug(f"Discovered plugin: '{module_name}'") + except Exception as e: + logger.error(f"Error loading plugin '{module_name}': {e}") + logger.debug(traceback.format_exc()) + + def initialize_plugins(self, + data_caches_dict: Optional[dict[str, DataCache]] = None + ) -> None: + """Initializes enabled plugins and stores their instances. + + Args: + data_caches: Optional dictionary of shared DataCache instances. 
+ """ + plugin_names = list(self.plugins.keys()) + for name in plugin_names: + if self.config_manager.is_plugin_enabled(name): + plugin_cls: CheckItem = self.plugins[name] + try: + # Instantiate the plugin class with shared data caches + instance = plugin_cls(data_caches_dict=data_caches_dict) + instance.initialize() + self.plugin_instances[name] = instance + logger.debug("Initialized plugin: '%s'", name) + except Exception as exc: + logger.error("Error initializing plugin '%s': %s", name, exc) + logger.exception("Exception traceback:") + self.plugins.pop(name, None) + else: + logger.warning( + "Plugin '%s' is not enabled in the configuration.", name + ) + + def resolve_dependencies(self): + logger.debug("Resolving plugins dependencies") + # 只保留 enable 的插件 + enabled_plugins = {} + for name, cls in self.plugins.items(): + if self.config_manager.is_plugin_enabled(name): + enabled_plugins[name] = cls + + removed = True + while removed: + removed = False + to_remove = [] + for name, plugin_cls in enabled_plugins.items(): + dependencies = getattr(plugin_cls, 'dependencies', []) + for dep_name in dependencies: + if dep_name not in enabled_plugins: + logger.warning( + f"Plugin '{name}' dependency '{dep_name}' " + f"not satisfied, will be removed." + ) + to_remove.append(name) + break # 只要有一个依赖不满足就移除 + for name in to_remove: + enabled_plugins.pop(name) + removed = True + + # 构建依赖图和拓扑排序 + graph = defaultdict(list) + in_degree = {} + for name in enabled_plugins: + in_degree[name] = 0 + for name, plugin_cls in enabled_plugins.items(): + dependencies = getattr(plugin_cls, 'dependencies', []) + for dep_name in dependencies: + graph[dep_name].append(name) + in_degree[name] += 1 + + queue = deque([name for name in enabled_plugins if in_degree[name] == 0]) + sorted_plugins = [] + while queue: + node = queue.popleft() + sorted_plugins.append(node) + for neighbor in graph[node]: + in_degree[neighbor] -= 1 + if in_degree[neighbor] == 0: + queue.append(neighbor) + + if len(sorted_plugins) != len(enabled_plugins): + missing = set(enabled_plugins.keys()) - set(sorted_plugins) + logger.error(f"Circular dependency detected in plugins: '{missing}'") + raise CircularDependencyError( + f"Circular dependency detected among plugins. 
Problematic: '{missing}'" + ) + + # 只保留 enable 且排序后的插件 + self.plugins = {} + for name in sorted_plugins: + self.plugins[name] = enabled_plugins[name] + logger.debug("Resolved plugin dependencies: %s", ', '.join(self.plugins.keys())) + + def finalize_plugins(self): + for name in self.plugins: + instance = self.plugin_instances.get(name) + if instance is not None: + try: + instance.finalize() + logger.debug(f"Finished plugin: '{name}'") + except Exception as e: + logger.error(f"Error cleaning up plugin '{name}': {e}") + logger.debug(traceback.format_exc()) + + def get_plugin_instances(self): + # 只返回依赖解析后剩下的插件实例 + result = {} + for name in self.plugins: + if name in self.plugin_instances: + result[name] = self.plugin_instances[name] + return result + + def record_plugin_time(self, plugin_name: str, + start_time: float, end_time: float): + self.plugin_times[plugin_name] = end_time - start_time + + def print_plugin_times(self): + for plugin_name, duration in self.plugin_times.items(): + logger.debug(f"Plugin '{plugin_name}' " + f"ran for {duration:.2f}s.") diff --git a/ops/healthcheck/core/result_formatter.py b/ops/healthcheck/core/result_formatter.py new file mode 100644 index 0000000000000000000000000000000000000000..a95c1f0dbe1b062ef776a7a35e68add35529040e --- /dev/null +++ b/ops/healthcheck/core/result_formatter.py @@ -0,0 +1,76 @@ +# core/result_formatter.py +import json +from typing import List, Dict +import textwrap + +def wrap_line(line: str, width: int = 80) -> str: + if len(line) <= width: + return line + return '\n'.join( + textwrap.wrap( + line, + width=width, + break_long_words=False, + replace_whitespace=False + ) + ) + +def format_output(app_name: str, app_version: str, check_time: str, + results: List[Dict[str, str]], output_format: str = "text") -> str: + if output_format == "json": + return format_output_json(app_name, app_version, + check_time, results) + else: + return format_output_text(app_name, app_version, + check_time, results) + +def format_output_text(app_name: str, app_version: str, + check_time:str, results: List[Dict[str, str]]) -> str: + output = f"\n{app_name} v{app_version}\n" + output += f"Check time: {check_time}\n" + output += "=" * (len(app_name) + len(app_version) + len(check_time) + 1) +f"\n" + for result in results: + output += f"\n" + "-" * 40 + f"\n" + output += f"Name: {result['name']}\n" + output += f"Version: {result['version']}\n" + output += f"Level: {result['level']}\n" + output += f"Description: {wrap_line(result['description'])}\n" + output += f"Subsystem: {result['subsystem']}\n" + output += f"Category: {result['category']}\n" + + details_content = result['details'] + output += "Details: \n" + output += "####################\n" + for line in details_content.splitlines(): + output += wrap_line(line) + "\n" + output += "####################\n" + + output += f"Solution: {wrap_line(result['solution'])}\n" + + return output + +def format_output_json(app_name: str, app_version: str, check_time: str, + results: List[Dict[str, str]]) -> str: + # 重新排序每个结果字典 + ordered_results = [] + for result in results: + ordered_result = { + "Name": result["name"], + "Version": result["version"], + "Level": result["level"], + "Description": result["description"], + "Subsystem": result["subsystem"], + "Category": result["category"], + "Details": result["details"], + "Solution": result["solution"] + } + ordered_results.append(ordered_result) + + output = { + "app_name": app_name, + "app_version": app_version, + "check_time": check_time, + "results": ordered_results + } 
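+    # ensure_ascii=False in the dump below keeps the bilingual (Chinese/English)
+    # description and solution fields human-readable instead of \uXXXX escapes.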
+ + return json.dumps(output, indent=4, ensure_ascii=False) diff --git a/ops/healthcheck/core/result_validator.py b/ops/healthcheck/core/result_validator.py new file mode 100644 index 0000000000000000000000000000000000000000..e25a0f59b2b22e671992ab4c1eb306945f6e9bb8 --- /dev/null +++ b/ops/healthcheck/core/result_validator.py @@ -0,0 +1,74 @@ +# core/result_validator.py +import re +from typing import Optional + +def generate_result_str(result): + return "\n".join([f"{key}: {value}" for key, value in result.items()]) + +def validate_result(result: Optional[dict]): + + # Check for result type + if not isinstance(result, (dict, type(None))): + raise TypeError(f"Invalid result from 'push_result': {type(result)}. " + f"Expected dict or None." + ) + + if isinstance(result, type(None)): + return + + required_keys = ["name", + "version", + "level", + "details", + "description", + "solution", + "subsystem", + "category"] + valid_levels = ("OK", "INFO", "WARNING", "ERROR", "CRITICAL") + valid_categories = ("utilization", "saturation", + "error", "others") + version_pattern = re.compile(r'^\d+\.\d+\.[a-z0-9].*$') + + # Check for missing keys + missing_keys = [key for key in required_keys if key not in result] + if missing_keys: + result_str = generate_result_str(result) + raise ValueError(f"Result missing required keys: " + f"{', '.join(missing_keys)}. " + f"Fetched result:\n{result_str}") + + # Check for valid level + level = result.get("level") + if level not in valid_levels: + result_str = generate_result_str(result) + raise ValueError(f"Result has invalid 'Level': " + f"{level}. Valid levels are: " + f"{', '.join(valid_levels)}. " + f"Fetched result:\n{result_str}") + + # Check for valid category + category = result.get("category") + if category not in valid_categories: + result_str = generate_result_str(result) + raise ValueError(f"Result has invalid category: " + f"{category}. Valid categories are: " + f"{', '.join(valid_categories)}. " + f"Fetched result:\n{result_str}") + + # Check for valid version format + version = result.get("version") + if not version_pattern.match(version): + result_str = generate_result_str(result) + raise ValueError(f"Result has invalid version: {version}. " + f"Version must be in the format 'X.Y.Z' " + f"where X and Y are numbers. " + f"Fetched result:\n{result_str}") + + # Check if other fields are non-empty strings + for key in ["name", "details", "description", "solution", "subsystem"]: + value = result.get(key) + if not isinstance(value, str) or not value.strip(): + result_str = generate_result_str(result) + raise ValueError(f"Result has invalid value for {key}: {value}. " + f"It must be a non-empty string. 
" + f"Fetched result:\n{result_str}") diff --git a/ops/healthcheck/main.py b/ops/healthcheck/main.py new file mode 100644 index 0000000000000000000000000000000000000000..cbe41b0e36141daca5f0b17c069319979b42b9c1 --- /dev/null +++ b/ops/healthcheck/main.py @@ -0,0 +1,137 @@ +# main.py +import os +import sys +import asyncio +import traceback +import time +import argparse +import logging +from typing import Optional, Dict, Any +from core.config_manager import ConfigManager +from core.logger import configure_logger +from core.constants import FILE_PATHS +from core.datacache import FileFetchStrategy, DataCache +from core.check_item import CheckItem +from core.plugin_manager import PluginManager +from core.result_formatter import format_output +from core.result_validator import validate_result + +async def run_check(plugin_name: str, + plugin_instance: CheckItem | Any, + plugin_manager: PluginManager, + logger: logging.Logger) -> Optional[Dict[str, Any]]: + start_time = time.time() + try: + logger.debug(f"Running plugin: '{plugin_name}'") + result = await plugin_instance.base_run_check() + validate_result(result) + return result + + except Exception as e: + logger.error(f"Error running check '{plugin_name}': {e}") + logger.debug(traceback.format_exc()) + return None + + finally: + end_time = time.time() + plugin_manager.record_plugin_time(plugin_name, start_time, end_time) + + +def get_base_dir(): + """ + Determine the base directory for resource files. + If running as a bundled executable (e.g., packaged by PyInstaller), + return the directory where the executable is located. + Otherwise, return the directory where the current script file resides. + """ + if getattr(sys, 'frozen', False): + # Running as a bundled executable + return os.path.dirname(os.path.abspath(sys.executable)) + else: + # Running as a standard Python script + return os.path.dirname(os.path.abspath(__file__)) + +def get_and_check_resource_paths(): + base_dir = get_base_dir() + config_file = os.path.join(base_dir, "config.yaml") + checks_dir = os.path.join(base_dir, "checks") + + missing = [] + if not os.path.isfile(config_file): + missing.append(f"Config file: '{config_file}'") + if not os.path.isdir(checks_dir): + missing.append(f"Checks directory: '{checks_dir}'") + if missing: + msg = f"The following required resources are missing in base directory {base_dir}:\n" + msg += "\n".join(f" - {item}" for item in missing) + raise FileNotFoundError(msg) + return config_file, checks_dir + +async def main(output_format: str): + app_name = "OS Health Check" + app_version = "4.2.1" + check_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + data_caches_dict = {} + + try: + config_file, checks_dir = get_and_check_resource_paths() + except FileNotFoundError as e: + print(e) + sys.exit(1) + + # Initialize config manager + config_manager = ConfigManager(config_file) + + # Initialize main application logger + logger = configure_logger(config_manager, "TOS_Health_Check") + + # Initialize shared data caches for each file if not already present + for file_path_name, file_path in FILE_PATHS.items(): + if file_path_name not in data_caches_dict: + data_caches_dict[file_path_name] = DataCache(FileFetchStrategy(file_path), ttl=5) + + # Create the plugin manager and discover available plugins in the checks directory + plugin_manager = PluginManager(config_manager=config_manager) + plugin_manager.discover_plugins(checks_dir) + + # Initialize all discovered plugins, passing the shared data caches to each instance + 
plugin_manager.initialize_plugins(data_caches_dict=data_caches_dict) + + # Resolve dependencies + plugin_manager.resolve_dependencies() + + # Run enabled plugins and collect results + tasks = [] + for plugin_name, plugin_instance in \ + plugin_manager.get_plugin_instances().items(): + if config_manager.is_plugin_enabled(plugin_name): + tasks.append( + run_check(plugin_name, plugin_instance, plugin_manager, logger) + ) + + results = await asyncio.gather(*tasks) + results = [result for result in results if result is not None] + + # Finalize plugins + plugin_manager.finalize_plugins() + + # Print plugin run times + plugin_manager.print_plugin_times() + + # Format and print the output + output = format_output(app_name, + app_version, + check_time, + results, + output_format) + # Display output + logger.info(f"\n{output}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run OS Health Check") + parser.add_argument("--format-output", "-f", choices=["text", "json"], + default="text", help="Output format text or json") + args = parser.parse_args() + + asyncio.run(main(args.format_output)) + diff --git a/ops/healthcheck/ops-help b/ops/healthcheck/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..a748820415a06df166eb9fc01b793039ed73860f --- /dev/null +++ b/ops/healthcheck/ops-help @@ -0,0 +1 @@ +OS health check \ No newline at end of file diff --git a/ops/healthcheck/ops-run b/ops/healthcheck/ops-run new file mode 120000 index 0000000000000000000000000000000000000000..5b0a2361dc0706e206b061e8f2a3c5dbc6257022 --- /dev/null +++ b/ops/healthcheck/ops-run @@ -0,0 +1 @@ +tos-health-check \ No newline at end of file diff --git a/ops/healthcheck/requirements.txt b/ops/healthcheck/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc548809aad5952b6809b4c31f6de189d0082cfc --- /dev/null +++ b/ops/healthcheck/requirements.txt @@ -0,0 +1,2 @@ +ruamel.yaml==0.18.10 +ruamel.yaml.clib==0.2.12 diff --git a/ops/healthcheck/setup_dev_env.sh b/ops/healthcheck/setup_dev_env.sh new file mode 100755 index 0000000000000000000000000000000000000000..38d062e67f04e64dc62dd1a4148f904127a2c332 --- /dev/null +++ b/ops/healthcheck/setup_dev_env.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# TOS Health Check Dev Environment Setup Script +# # Automates virtual environment creation and requirements installation with: +# - Python 3.11 validation +# - Tencent PyPI mirror configuration +# - Virtual environment initialization +# - Dependency installation from requirements.txt + +set -euo pipefail + +# Format messages +err() { + echo "[Error]: $@" >&2 +} +info() { + echo "[Info]: $@" +} + +# Configuration +PYTHON_VERSION="3.11" +VENV_DIR=".venv" +PYPI_MIRROR="http://mirrors.tencent.com/tencent_pypi/simple/" +TRUSTED_HOST="mirrors.tencent.com" +PYTHON_SOURCE_URL="https://www.python.org/ftp/python" +FORCE=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --force) + FORCE=true + shift + ;; + *) + err "Unknown option: $1" + exit 1 + ;; + esac +done + +info "" +info "===== Starting Deployment =====" +info "Python Package Index Source: $PYPI_MIRROR" +info "Python Source: $PYTHON_SOURCE_URL" +info "" + + +# Mandatory requirements check +if [ ! -f "requirements.txt" ]; then + err "requirements.txt not found" + exit 1 +fi + +# Check existing virtual environment +if [[ -f "${VENV_DIR}/bin/activate" ]]; then + if [[ "$FORCE" == "true" ]]; then + info "Existing virtual environment detected. Forcing recreation due to --force..." 
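+        # No removal is needed at this point: the existing environment is deleted
+        # unconditionally by the "rm -rf ${VENV_DIR}" step below before recreation.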
+ #rm -rf "${VENV_DIR}" + else + err "Virtual environment already exists at $(pwd)/${VENV_DIR}, " + err "using '--force' option if you would like to overwrite it." + exit 1 + fi +fi + +# Create virtual environment +info "[1/5] Creating virtual environment..." +rm -rf ${VENV_DIR} +if command -v ./python3.11-build/usr/local/bin/python3.11; then + info "Using ./python3.11-build/usr/local/bin/python3.11" + ./python3.11-build/usr/local/bin/python3.11 -m venv --clear "${VENV_DIR}" + export LD_LIBRARY_PATH=$(pwd)/python3.11-build/usr/local/lib:$LD_LIBRARY_PATH +elif command -v python3.11; then + info "Using $(which python3.11)" + python3.11 -m venv --clear "$VENV_DIR" +else + err "python3.11 not available." + cat << 'EOF' +############################################################################### +You can simply install Python 3.11 via 'yum install python3.11', or alternatively, +choose to build it from source. While the latter is more involved, it ensures no +changes to the system environment with installing all binaries and libraries to +an isolated directory, avoiding any conflicts with the system-default +Python (e.g., /usr/bin/python3). Here's how to do it: +1.) # 安装libffi-dev + yum install libffi-devel +2.) # 下载源码包, 根据实际情况替换 ,已验证 Python3.11.12 可用,但是鼓励尝试使用最新版本 python3.11 + wget https://www.python.org/ftp/python/3.11./Python-3.11..tgz +3.) # 创建编译产物的目录(不要修改目录的路径与名称) + mkdir python3.11-build +4.) # 解压文件至项目根目录 + tar xzf Python-3.11..tgz +5.) cd Python-3.11. +6.) CFLAGS="-Wno-error" ./configure --prefix=/usr/local --enable-optimizations --with-system-ffi --enable-shared +7.) make -j$(nproc) +8.) make install DESTDIR=../python3.11-build +9.) export LD_LIBRARY_PATH=$(pwd)/../python3.11-build/usr/local/lib:$LD_LIBRARY_PATH +10.) # 将编译的 python3.11 添加至 PATH 环境变量(可选操作步骤) + export PATH=$(pwd)/../python3.11-build/usr/local/bin:$PATH +11.) # 验证Python版本 + ../python3.11-build/usr/local/bin/python3.11 -c "import platform; print(platform.python_version())" +EOF + exit 1 +fi + +# Activate environment +info "[2/5] Activating virtual environment..." +source "$VENV_DIR/bin/activate" + +# Configure pip +info "[3/5] Configuring pip (Tencent PyPI mirrors)..." +python -m pip install \ + --index-url "$PYPI_MIRROR" \ + --trusted-host "$TRUSTED_HOST" \ + --no-cache-dir \ + "pip>=25,<26" + +info "[4/5] Configuring pyinstaller (Tencent PyPI mirrors)..." +python -m pip install \ + --index-url "$PYPI_MIRROR" \ + --trusted-host "$TRUSTED_HOST" \ + --no-cache-dir \ + "pyinstaller>=6.13,<6.14" + +# Install requirements +info "[5/5] Installing requirements..." 
+python -m pip install \ + --index-url "$PYPI_MIRROR" \ + --trusted-host "$TRUSTED_HOST" \ + --no-cache-dir \ + -r requirements.txt + +info "" +info "===== Deployment Complete =====" +info "Python: $(python --version 2>&1)" +info "pip: $(python -m pip --version | awk '{print $2}')" +info "Virtual env: source $(pwd)/$VENV_DIR/bin/activate" +info "" diff --git a/ops/healthcheck/tos-health-check b/ops/healthcheck/tos-health-check new file mode 100755 index 0000000000000000000000000000000000000000..b495d1987127688f31dbc35cd2cc639f3869aca2 Binary files /dev/null and b/ops/healthcheck/tos-health-check differ diff --git a/ops/healthcheck/utilities/file_content_utils.py b/ops/healthcheck/utilities/file_content_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5f2a7406148e2edbc229a6001071c392077b369a --- /dev/null +++ b/ops/healthcheck/utilities/file_content_utils.py @@ -0,0 +1,120 @@ +# utilities/file_content_utils.py +from typing import Dict, Any + +def parse_common_key_value_content(content: str) -> Dict[str, Any]: + """Parses a multi-line string where each line contains a key in the first column and a value in the second column. + + Each line is expected to be in the format: + [unit] + + The key is taken from the first column (with any trailing colon removed), and the value is taken from the second column. + The function attempts to convert the value to float; if conversion fails, the value is kept as a string. + Lines with fewer than two columns are ignored. + + Args: + content: Multi-line string to parse, where each line contains a key and a value. + + Returns: + A dictionary mapping each key (str) to its value (float if possible, otherwise str). + """ + result = {} + for line in content.splitlines(): + line = line.strip() + if not line: + continue # Skip empty lines + parts = line.split() + if len(parts) < 2: + continue # Skip malformed lines + key = parts[0].rstrip(':') + value_str = parts[1] + # Try to convert value to float, fallback to str + try: + value = float(value_str) + except ValueError: + value = value_str + result[key] = value + return result + +import re +from collections import defaultdict + +def parse_proc_zoneinfo(content: str) -> dict[str, dict[str, dict[str, float]]]: + """ + Parse /proc/zoneinfo content and return structured data. + + Args: + content (str): The content of /proc/zoneinfo + + Returns: + dict: A nested dictionary with structure: + { + node_id: { + zone_name: { + 'min': int, + 'low': int, + 'high': int, + 'free': int + }, + ... + }, + ... 
+ } + """ + # Regular expression patterns + node_pattern = re.compile(r'Node (\d+), zone\s+(\w+)') + pages_free_pattern = re.compile(r'pages free\s+(\d+)') + min_pattern = re.compile(r'min\s+(\d+)') + low_pattern = re.compile(r'low\s+(\d+)') + high_pattern = re.compile(r'high\s+(\d+)') + + result = defaultdict(dict) + current_node = None + current_zone = None + + for line in content.split('\n'): + line = line.strip() + + # Check for Node and zone line + node_match = node_pattern.match(line) + if node_match: + current_node = node_match.group(1) + current_zone = node_match.group(2) + result[current_node][current_zone] = { + 'min': None, + 'low': None, + 'high': None, + 'free': None + } + continue + + # Skip if we're not in a zone section + if current_node is None or current_zone is None: + continue + + # Check for pages free + if 'pages free' in line: + free_match = pages_free_pattern.search(line) + if free_match: + result[current_node][current_zone]['free'] = int(free_match.group(1)) + continue + + # Check for watermarks + if 'min' in line: + min_match = min_pattern.search(line) + if min_match: + result[current_node][current_zone]['min'] = int(min_match.group(1)) + continue + + if 'low' in line: + low_match = low_pattern.search(line) + if low_match: + result[current_node][current_zone]['low'] = int(low_match.group(1)) + continue + + if 'high' in line: + high_match = high_pattern.search(line) + if high_match: + result[current_node][current_zone]['high'] = int(high_match.group(1)) + continue + + return dict(result) diff --git a/ops/healthcheck/utilities/runcmd.py b/ops/healthcheck/utilities/runcmd.py new file mode 100644 index 0000000000000000000000000000000000000000..852ea67303402cb4e97ac44bd084e431d29be60f --- /dev/null +++ b/ops/healthcheck/utilities/runcmd.py @@ -0,0 +1,266 @@ +# utilities/runcmd.py +import subprocess +import select +import threading +import logging +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Tuple, Callable, Optional + +logger = logging.getLogger("TOS_Health_Check") + +def runcmd_stream_output(command: str, + timeout: float, + func_stream: Callable, + func_stream_kwargs: dict) -> Tuple[str, str, Optional[int]]: + + stderr_lines = [] + return_code = None + process = None + process_lock = threading.Lock() + process_inactive_event = threading.Event() + timer = None + + stdout_lines = [] + + def _callback_wrapper(line): + func_stream(line, **func_stream_kwargs) + stdout_lines.append(line) + + # Set a timer to terminate the process if timeout + def _timeout_handler(): + with process_lock: + if not process_inactive_event.is_set() and process and process.poll() is None: + process_inactive_event.set() + logger.info(f"Command '{command}' timeout in threshold {timeout}s, " + f"sending SIGTERM to terminate") + process.terminate() + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + + env = { + "PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin", + "LANG": "C", + "LC_ALL": "C", + "TERM": "xterm-256color" + } + + try: + with process_lock: + process = subprocess.Popen( + ["/bin/sh", "-c", command], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + bufsize=1, + text=True, + errors='replace' + ) + + timer = threading.Timer(timeout, _timeout_handler) + timer.start() + + while not process_inactive_event.is_set(): + with process_lock: + if process.poll() is not None: + break + + fds = [] + if process.stdout: + fds.append(process.stdout) + if 
process.stderr: + fds.append(process.stderr) + if not fds: + break + + try: + # TODO: select timeout should be configurable + readable, _, _ = select.select(fds, [], [], 0.5) + if not readable: + continue + except ValueError: + break + + for fd in readable: + try: + line =fd.readline() + if not line: + continue + if process.stdout and fd == process.stdout: + # TODO: 考虑是否将输出记录到日志 + # logger.debug(f"stdout: {line.strip()}") + # callback funcion to stream output function + _callback_wrapper(line) + elif process.stderr and fd == process.stderr: + logger.info(f"Command {command} : stderr output {line}") + stderr_lines.append(line) + except ValueError: + process_inactive_event.set() + break + + # Try to drain remaining data from process buffers (non-blocking) + def _drain_remaining_data(fd, callback): + if not fd: + return + + while not process_inactive_event.is_set(): + try: + readable, _, _ = select.select([fd], [], [], 0) + if not readable: + break + line = fd.readline() + if not line: + break + callback(line) + except ValueError: + break + + with process_lock: + _drain_remaining_data(process.stdout, _callback_wrapper) + _drain_remaining_data(process.stderr, lambda line: stderr_lines.append(line)) + + except Exception as e: + logger.error(f"Exception occurred while running command '{command}': {str(e)}") + # finally逻辑会覆盖return_code,此处是防止return_code为None + return_code = -1 + process_inactive_event.set() + + finally: + if timer: + timer.cancel() + + with process_lock: + if process: + process_inactive_event.set() + process.terminate() + try: + return_code = process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + return_code = process.wait() + + if process.stdout: + try: + process.stdout.close() + except Exception: + pass + + if process.stderr: + try: + process.stderr.close() + except Exception: + pass + else: + return_code = -1 + + return "".join(stdout_lines), "".join(stderr_lines), return_code + +count = 0 +def count_lines(line): + print(line.rstrip()) + global count + count +=1 + +#runcmd_stream_output("mpstat -P ALL 1", 10, count_lines, {}) +#print(count) + +def execute_command(command: str, + timeout: float = 60, + env: Optional[dict] = None, + cwd: Optional[str] = None,) -> Tuple[str, str, int]: + + stdout, stderr = "", "" + return_code = -1 + process = None + base_env = { + "PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin", + "LANG": "C.UTF-8", + "LC_ALL": "C.UTF-8", + "PYTHONUNBUFFERED": "1" + } + final_env = {**base_env, **(env or {})} + + def _terminate_process(process: subprocess.Popen) -> None: + try: + if process and process.poll() is None: + process.terminate() + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + logger.warning(f"Terminating COMM:{command} timeout: threshold {timeout}s, sending SIGKILL") + process.kill() + + try: + process.wait(timeout=5) + except Exception: + pass + + except ProcessLookupError: + pass + except Exception as e: + logger.warning(f"Terminating COMM:{command} unknown exception PID:{getattr(process, 'pid', None)}: {str(e)}") + + def _safe_cleanup(process: Optional[subprocess.Popen]) -> None: + if not process: + return + try: + for pipe in [getattr(process, 'stdout', None), getattr(process, 'stderr', None)]: + if pipe: + try: + pipe.close() + except Exception as e: + logger.debug(f"Unknown exception when closing child process stdout/stderr: {str(e)}") + if process.poll() is None: + process.kill() + try: + process.wait(timeout=5) + except Exception: + pass + except Exception as e: + logger.debug(f"Cleanup 
exception: {str(e)}") + + def _safe_decode(data: bytes | str | None) -> str: + if isinstance(data, str): + return data + if isinstance(data, bytes): + return data.decode('utf-8', errors='replace') + return "" + + try: + process = subprocess.Popen(["/bin/sh", "-c", command], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=final_env, + cwd=cwd, + bufsize=1, + encoding='utf-8', + errors='replace', + text=True, + ) + + try: + stdout, stderr = process.communicate(timeout=timeout) + return_code = process.returncode if process.returncode is not None else -1 + except subprocess.TimeoutExpired as e: + stdout = _safe_decode(getattr(e, 'stdout', None)) + stderr = _safe_decode(getattr(e, 'stderr', None)) + logger.error(f"Command '{command} timeout, " + f"threshold {timeout}s " + f"PID: {getattr(process, 'pid', None)}") + return_code = -1 + _terminate_process(process) + except Exception as e: + logger.error( + f"Command failed: {command} - {type(e).__name__}: {str(e)}", + exc_info=True, + stack_info=True + ) + return_code = -1 + finally: + _safe_cleanup(process) + if return_code is None: + return_code = -1 + + return stdout, stderr, return_code \ No newline at end of file diff --git a/ops/interaction/cpu/frequency/ops-help b/ops/interaction/cpu/frequency/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..29c4f53b83aff88a9ad2e44e4ab0448d492f824a --- /dev/null +++ b/ops/interaction/cpu/frequency/ops-help @@ -0,0 +1 @@ +cpu: show cpu frequency (default cpu 0; assign cpu: -c cpulist, such as-c 1-3) /\*读取cpu频率*/\ diff --git a/ops/interaction/cpu/frequency/ops-run b/ops/interaction/cpu/frequency/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..5fd69188d4cfd7f88d1707e07f947c06a4c31de0 --- /dev/null +++ b/ops/interaction/cpu/frequency/ops-run @@ -0,0 +1,28 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show cpu frequency (default cpu 0; assign cpu: -c cpulist, such as-c 1-3)\033[0m\n" + #------ frequency ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + if [ $# -le 0 ]; then + $dir/oc-ops kernel-tools cpupower frequency-info + else : + $dir/oc-ops kernel-tools cpupower $* frequency-info + fi + +} +main $* diff --git a/ops/interaction/cpu/interrup_latency/ops-help b/ops/interaction/cpu/interrup_latency/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..6e175fe18a3aa5ca0a627f1d1f01e6bed4dee4e0 --- /dev/null +++ b/ops/interaction/cpu/interrup_latency/ops-help @@ -0,0 +1 @@ +cpu: show interrupt latency /\*统计中断时延, 可选配置监控时间及irq超时时延(超时打印stack)*/\ diff --git a/ops/interaction/cpu/interrup_latency/ops-run b/ops/interaction/cpu/interrup_latency/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..613ffa1abe1e3b303a3d0a41294053366e86b926 --- /dev/null +++ b/ops/interaction/cpu/interrup_latency/ops-run @@ -0,0 +1,51 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show interrupt latency\033[0m\n" + printf "\033[35m[1]To control how long to 
run this, need:\$run_time, unit s;\033[0m\n" + printf "\033[32muses: oc-ops interaction cpu interrup_latency 30\033[0m\n" + printf "\033[35m[2]To print irq stack whoes run time > timeout, need:[\$run_time], \$time_out:unit ms\033[0m\n" + printf "\033[32muses: oc-ops interaction cpu interrup_latency 30 100\033[0m\n" + + printf "\033[35m\nTo running now, default parameter\033[0m\n" + #------ interrupt ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops cpu irq latency -c + if [ $# == 0 ]; then + $dir/oc-ops cpu irq latency -e 1 -q 30 + elif [ $# == 1 ]; then + $dir/oc-ops cpu irq latency -e 1 -q $1 + else : + $dir/oc-ops cpu irq latency -e 1 -q $1 -t $2 + fi + + result=1 + delay=0 + while [ $result -ne 0 ]; do + result=$(cat /proc/irq_latency/enable) + sleep 1 + delay=$(($delay + 1)) + val=`expr $delay % 10` + if [ $val == 0 ]; then + echo ------------show data, $delay-------------- + cat /proc/irq_latency/trace_dist + cat /proc/irq_latency/trace_stack + fi + done + +} +main $* diff --git a/ops/interaction/cpu/ops-help b/ops/interaction/cpu/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..bd51fbb717c028a4f42809d3054a87a18c62a450 --- /dev/null +++ b/ops/interaction/cpu/ops-help @@ -0,0 +1 @@ +HCI: scheduler and irq tools .etc diff --git a/ops/interaction/cpu/runqlatency/ops-help b/ops/interaction/cpu/runqlatency/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c438085e4fa4b215b65daaabc1df8f3a883decb5 --- /dev/null +++ b/ops/interaction/cpu/runqlatency/ops-help @@ -0,0 +1 @@ +cpu: summarize run queue (scheduler) latency as a histogram /\*统计调度器运行队列时延,直方图形式显示*/\ diff --git a/ops/interaction/cpu/runqlatency/ops-run b/ops/interaction/cpu/runqlatency/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..dfc55f632b3ebdccd1a0bfac97b8f14bbae25f89 --- /dev/null +++ b/ops/interaction/cpu/runqlatency/ops-run @@ -0,0 +1,32 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work:summarize run queue (scheduler) latency as a histogram\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu runqlatency [\$interval] [\$pid]\033[0m\n" + #------ frequency ------ + if [ $# -le 0 ]; then + printf "\033[32mdefault:10s summaries, and with timestamps\033[0m\n" + /usr/share/bcc/tools/runqlat -T 10 -P + elif [ $# -eq 1 ]; then + printf "\033[32mdefault:10s summaries, and with timestamps, for \$pid:$1\033[0m\n" + /usr/share/bcc/tools/runqlat -T 10 -p $1 + elif [ $# -eq 2 ]; then + printf "\033[32minteval:$1s summaries, and with timestamps, for pid:$2\033[0m\n" + /usr/share/bcc/tools/runqlat -T $1 -p $2 + fi + +} +main $* diff --git a/ops/interaction/cpu/runqlen/ops-help b/ops/interaction/cpu/runqlen/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..66082da693c4c09b5de9d108364696c77f9cf9a5 --- /dev/null +++ b/ops/interaction/cpu/runqlen/ops-help @@ -0,0 +1 @@ +cpu: summarize scheduler run queue length as a histogram /\*统计调度器运行队列长度,直方图形式显示*/\ diff --git a/ops/interaction/cpu/runqlen/ops-run b/ops/interaction/cpu/runqlen/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..c6d023d5859a9ef72a09376c6d48576a8fc8e4c9 --- /dev/null +++ b/ops/interaction/cpu/runqlen/ops-run @@ -0,0 +1,31 @@ +#!/bin/bash +# -*- 
coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work:summarize scheduler run queue length as a histogram\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu runqlen [cpu]\033[0m\n" + #------ frequency ------ + if [ $# -le 0 ]; then + printf "\033[32mdefault:6s summarize run queue length as a histogram for all cpu\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu runqlen\033[0m\n" + /usr/share/bcc/tools/runqlen -T 6 + elif [ $# -eq 1 ]; then + printf "\033[32mdefault:6s summaries, show each cpu\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu runqlen cpu\033[0m\n" + /usr/share/bcc/tools/runqlen -T 6 -C + fi + +} +main $* diff --git a/ops/interaction/cpu/runqslower/ops-help b/ops/interaction/cpu/runqslower/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..374cb435a0c1d785b915a2f279d2c15ae5be1e7a --- /dev/null +++ b/ops/interaction/cpu/runqslower/ops-help @@ -0,0 +1 @@ +cpu: trace high run queue latency /\*跟踪运行队列超过多少us的时延,缺省10000us*/\ diff --git a/ops/interaction/cpu/runqslower/ops-run b/ops/interaction/cpu/runqslower/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..c0103a0a1c6e83b24f6f5c91e8afa0c4583f90e1 --- /dev/null +++ b/ops/interaction/cpu/runqslower/ops-run @@ -0,0 +1,42 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work:trace high run queue latency\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu runqslower [\$latency] [-p \$pid]\033[0m\n" + printf "\033[32msuch as :oc-ops interaction cpu runqslower\033[0m\n" + printf "\033[32msuch as, latency > 1000us:oc-ops interaction cpu runqslower 1000\033[0m\n" + printf "\033[32msuch as, trace one pid :oc-ops interaction cpu runqslower -p \$pid\033[0m\n" + printf "\033[32msuch as, latench and pid :oc-ops interaction cpu runqslower 1000 -p \$pid\033[0m\n" + #------ frequency ------ + if [ $# -le 0 ]; then + printf "\033[32mdefault:trace high run queue latency, default 10000us\033[0m\n" + /usr/share/bcc/tools/runqslower + elif [ $# -eq 1 ]; then + printf "\033[32mdefault:trace high run queue latency:$1\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu runqslower \$latency\033[0m\n" + /usr/share/bcc/tools/runqslower $1 + elif [ $# -eq 2 ]; then + printf "\033[32mdefault:trace high run queue latency, default 10000us, pid:\$pid\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu runqslower -p \$pid\033[0m\n" + /usr/share/bcc/tools/runqslower -p $1 + elif [ $# -eq 3 ]; then + printf "\033[32mdefault:trace high run queue latency:$1us, pid:$3\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu runqslower -p \$pid\033[0m\n" + /usr/share/bcc/tools/runqslower $1 -p $3 + fi + +} +main $* diff --git a/ops/interaction/cpu/stat_interrupt/ops-help b/ops/interaction/cpu/stat_interrupt/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..e5921293bdeef79d8847ef16df7398da771b2916 --- /dev/null +++ b/ops/interaction/cpu/stat_interrupt/ops-help @@ -0,0 +1 @@ +cpu: show interrupt latency /\*统计中断数量等情况*/\ diff --git 
a/ops/interaction/cpu/stat_interrupt/ops-run b/ops/interaction/cpu/stat_interrupt/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..2703b6f09aa13f151fba856092ebb05146d6044c --- /dev/null +++ b/ops/interaction/cpu/stat_interrupt/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show interrupt latency\033[0m\n" + + #------ interrupt ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops cpu irq stat +} +main $* diff --git a/ops/interaction/fs/dir_read_write_stat/ops-help b/ops/interaction/fs/dir_read_write_stat/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..65422956c45a70d11d8c3d1b46eb5f89044a9a02 --- /dev/null +++ b/ops/interaction/fs/dir_read_write_stat/ops-help @@ -0,0 +1 @@ +fs: show rw speed/bytes of one dir /\*显示指定目录的单位时间的读写次数及bytes数*/\ diff --git a/ops/interaction/fs/dir_read_write_stat/ops-run b/ops/interaction/fs/dir_read_write_stat/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..4935edbc2534014ddc157363b32ae2a27596c6c3 --- /dev/null +++ b/ops/interaction/fs/dir_read_write_stat/ops-run @@ -0,0 +1,33 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show rw numbers/bytes of one dir\033[0m\n" + + #------ read/write numbers and bytes at interval time ------ + if [ $# -lt 2 ]; then + printf "\033[32mplease assign dir/interval/[process]:oc-ops interaction fs dir_read_write_stat \$dir \$inverval [\$process]\033[0m\n" + return + fi + + if [ $# == 2 ]; then + printf "\033[32msee dir:$1, interval:$2\033[0m\n" + /usr/share/bcc/tools/dirtop -d $1 $2 + elif [ $# == 3 ]; then + printf "\033[32msee dir:%1, interval:$2, pid:$3\033[0m\n" + /usr/share/bcc/tools/dirtop -d $1 $2 -p $3 + fi +} +main $* diff --git a/ops/interaction/fs/ext4/ops-help b/ops/interaction/fs/ext4/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c5ac6cf0b8bde7c8404cbc6577e90568a00c8435 --- /dev/null +++ b/ops/interaction/fs/ext4/ops-help @@ -0,0 +1 @@ +fs: ext4 tools /\*ext4 修复工具*/\ diff --git a/ops/interaction/fs/ext4/ops-run b/ops/interaction/fs/ext4/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..3ad96a51ef1277a9b46c555bb56b3e5d67db65c0 --- /dev/null +++ b/ops/interaction/fs/ext4/ops-run @@ -0,0 +1,47 @@ +#!/bin/bash +# -*- coding: utf-8 -*- + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + if [ $# -lt 1 ]; then + printf "\033[31m输入需要查询的能力:\033[0m\n" + printf "\033[32moc-ops interaction fs ext4 help\033[0m\n" + printf "\033[32moc-ops interaction fs ext4 recover xxx(device, as /dev/sda) \*修复文件系统*\ \033[0m\n" + exit 1 + fi + sub="$1" + + if [ -z "$sub" ]; then + echo ---no work find: $sub + exit 1 + fi + + #work + printf "\033[31m-------------find work: $sub\033[0m\n" + + if [ ! 
-z $(strstr $sub "help") ]; then + printf "\033[32moc-ops interaction fs ext4 recover xxx(device, as /dev/sda) \*修复文件系统*\ \033[0m\n" + printf "\033[32mNormal:oc-ops fs ext4_recover -h\033[0m\n" + + exit 1 + fi + #------ recover fs ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + if [ $# -lt 1 ]; then + $dir/oc-ops fs ext4_recover -h + exit 1 + fi + if [ $# -eq 2 ]; then + $dir/oc-ops fs ext4_recover -d $2 + else : + printf "\033[32moc-ops interaction fs ext4 recover xxx(device, as /dev/sda) \*修复文件系统*\ \033[0m\n" + $dir/oc-ops fs ext4_recover -h + fi +} +main $* diff --git a/ops/interaction/fs/file/file_life/ops-help b/ops/interaction/fs/file/file_life/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..8b74cb25d31bfe3346c84160461fa62a84cff769 --- /dev/null +++ b/ops/interaction/fs/file/file_life/ops-help @@ -0,0 +1 @@ +fs: show life/name/age of file, deleted by who /\*显示文件生命周期,文件age,文件名字,被哪一个进程删除*/\ diff --git a/ops/interaction/fs/file/file_life/ops-run b/ops/interaction/fs/file/file_life/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..bf9116f27ade548ac3b9d38c9141fa60bea99fb3 --- /dev/null +++ b/ops/interaction/fs/file/file_life/ops-run @@ -0,0 +1,26 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show life of file, file name, file age, deleted by who\033[0m\n" + + #------ read/write numbers and bytes at interval time ------ + if [ $# == 0 ]; then + /usr/share/bcc/tools/filelife + elif [ $# == 1 ]; then + /usr/share/bcc/tools/filelife -p $1 + fi +} +main $* diff --git a/ops/interaction/fs/file/ops-help b/ops/interaction/fs/file/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..e8800d409ae1c881955fd10c3e231569371cdea3 --- /dev/null +++ b/ops/interaction/fs/file/ops-help @@ -0,0 +1 @@ +fs: file tools /\*file相关的工具,包括文件的生命周期,正在读写的文件,文件读写byte top等*/\ diff --git a/ops/interaction/fs/file/read_which_file/ops-help b/ops/interaction/fs/file/read_which_file/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b472010a5cf4ae35d7f9b7565d284cf84b9fed04 --- /dev/null +++ b/ops/interaction/fs/file/read_which_file/ops-help @@ -0,0 +1 @@ +fs: show which file to read now /\*显示正在读什么文件*/\ diff --git a/ops/interaction/fs/file/read_which_file/ops-run b/ops/interaction/fs/file/read_which_file/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..98dc2f121a12b029a1be8cf9fba8c573f1aa9516 --- /dev/null +++ b/ops/interaction/fs/file/read_which_file/ops-run @@ -0,0 +1,26 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show which file to read now\033[0m\n" + + #------ 正在读写哪一个文件 ------ + version=$(uname -r) + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops os_stat -fg 5 -w 1 -s file -s path -s dentry -a f_path -a dentry -a d_iname \ + -i 3 -t char -f vfs_read -v /boot/vmlinux-$version +} +main $* diff --git a/ops/interaction/fs/file/sync_io_stat_perf_file/ops-help 
b/ops/interaction/fs/file/sync_io_stat_perf_file/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..ec99dbf721ea142be64b8418b107fcec45d1e823 --- /dev/null +++ b/ops/interaction/fs/file/sync_io_stat_perf_file/ops-help @@ -0,0 +1 @@ +fs: show latency and bytes of file sync io /\*显示file同步io的读写时延与byte数*/\ diff --git a/ops/interaction/fs/file/sync_io_stat_perf_file/ops-run b/ops/interaction/fs/file/sync_io_stat_perf_file/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..b6a432bda8b9b974d6618f7b3773675f0151e29e --- /dev/null +++ b/ops/interaction/fs/file/sync_io_stat_perf_file/ops-run @@ -0,0 +1,32 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show latency and bytes of file sync io, which latency > \$interval\033[0m\n" + + #------ read/write numbers and bytes at interval time ------ + printf "\033[32mneed: interval/[process]:oc-ops interaction fs sync_io_stat_perf_file \$interval [\$process]\033[0m\n" + if [ $# == 0 ]; then + printf "\033[32msee sync io latency > interval:10ms\033[0m\n" + /usr/share/bcc/tools/fileslower + elif [ $# == 1 ]; then + printf "\033[32msee sync io latency > interval:$1\033[0m\n" + /usr/share/bcc/tools/fileslower $1 + elif [ $# == 2 ]; then + printf "\033[32msee sync io latency > interval:$1, pid:$2\033[0m\n" + /usr/share/bcc/tools/fileslower $1 -p $2 + fi +} +main $* diff --git a/ops/interaction/fs/file/top_bytes_of_files/ops-help b/ops/interaction/fs/file/top_bytes_of_files/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c2fe82d1c7c196b04bd0e50ce48f922a6ab780f8 --- /dev/null +++ b/ops/interaction/fs/file/top_bytes_of_files/ops-help @@ -0,0 +1 @@ +fs: show files top of bytes /\*显示进程对文件读写bytes top排序*/\ diff --git a/ops/interaction/fs/file/top_bytes_of_files/ops-run b/ops/interaction/fs/file/top_bytes_of_files/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..71552bd2c4cd878dbf647e050f83873f461a2100 --- /dev/null +++ b/ops/interaction/fs/file/top_bytes_of_files/ops-run @@ -0,0 +1,31 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show files top of bytes\033[0m\n" + + #------ file bytes top at interval time ------ + + printf "\033[32mtwo selected: [interval(default 10)]/[process]:oc-ops interaction fs sync_io_stat_perf_file [\$interval] [\$process]\033[0m\n" + sleep 5 + if [ $# -lt 1 ]; then + /usr/share/bcc/tools/filetop 10 -C + elif [ $# == 1 ]; then + /usr/share/bcc/tools/filetop $1 -C + elif [ $# == 2 ]; then + /usr/share/bcc/tools/filetop $1 -p $2 -C + fi +} +main $* diff --git a/ops/interaction/fs/file/write_which_file/ops-help b/ops/interaction/fs/file/write_which_file/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..5f2e40b38926b3c49145a06a062400852dfcab58 --- /dev/null +++ b/ops/interaction/fs/file/write_which_file/ops-help @@ -0,0 +1 @@ +fs: show which file to write now /\*显示正在写什么文件*/\ diff --git a/ops/interaction/fs/file/write_which_file/ops-run 
b/ops/interaction/fs/file/write_which_file/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..cc0ee79efb9d0f0968844142dcbd4956587d66fe --- /dev/null +++ b/ops/interaction/fs/file/write_which_file/ops-run @@ -0,0 +1,26 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show which file to write now\033[0m\n" + + #------ 正在读写哪一个文件 ------ + version=$(uname -r) + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops os_stat -fg 5 -w 1 -s file -s path -s dentry -a f_path -a dentry -a d_iname \ + -i 3 -t char -f vfs_write -v /boot/vmlinux-$version +} +main $* diff --git a/ops/interaction/fs/ops-help b/ops/interaction/fs/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..8d6d5365c558d627d4a82faa04b4688f17395ad7 --- /dev/null +++ b/ops/interaction/fs/ops-help @@ -0,0 +1 @@ +HCI: fs tools diff --git a/ops/interaction/fs/read_head/ops-help b/ops/interaction/fs/read_head/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..47ed251ed044fb26c89b49f94eb66ac3dbbdb940 --- /dev/null +++ b/ops/interaction/fs/read_head/ops-help @@ -0,0 +1 @@ +fs: show performance of read-ahead cache /\*显示fs预读性能*/\ diff --git a/ops/interaction/fs/read_head/ops-run b/ops/interaction/fs/read_head/ops-run new file mode 100644 index 0000000000000000000000000000000000000000..109bdd4803af4eab7f4225add663c3191c24e58f --- /dev/null +++ b/ops/interaction/fs/read_head/ops-run @@ -0,0 +1,27 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show performance of read-ahead cache\033[0m\n" + + #------ performance of read-ahead cache ------ + printf "\033[32mcan assign [\$interval], default 10 (s):oc-ops interaction fs readahead [\$inverval]\033[0m\n" + if [ $# == 1 ]; then + /usr/share/bcc/tools/readahead -d $1 + elif [ $# == 0 ]; then + /usr/share/bcc/tools/readahead -d 10 + fi +} +main $* diff --git a/ops/interaction/fs/readahead/ops-help b/ops/interaction/fs/readahead/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..47ed251ed044fb26c89b49f94eb66ac3dbbdb940 --- /dev/null +++ b/ops/interaction/fs/readahead/ops-help @@ -0,0 +1 @@ +fs: show performance of read-ahead cache /\*显示fs预读性能*/\ diff --git a/ops/interaction/fs/readahead/ops-run b/ops/interaction/fs/readahead/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..109bdd4803af4eab7f4225add663c3191c24e58f --- /dev/null +++ b/ops/interaction/fs/readahead/ops-run @@ -0,0 +1,27 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show performance of read-ahead cache\033[0m\n" + + #------ performance of read-ahead cache ------ + printf "\033[32mcan assign [\$interval], default 10 (s):oc-ops interaction fs readahead 
[\$inverval]\033[0m\n" + if [ $# == 1 ]; then + /usr/share/bcc/tools/readahead -d $1 + elif [ $# == 0 ]; then + /usr/share/bcc/tools/readahead -d 10 + fi +} +main $* diff --git a/ops/interaction/fs/xfs/ops-help b/ops/interaction/fs/xfs/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..8d6d5365c558d627d4a82faa04b4688f17395ad7 --- /dev/null +++ b/ops/interaction/fs/xfs/ops-help @@ -0,0 +1 @@ +HCI: fs tools diff --git a/ops/interaction/fs/xfs/xfs_operation_latency/ops-help b/ops/interaction/fs/xfs/xfs_operation_latency/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..1a0374e1c3f73263aef175862a0d4a0b11da76cd --- /dev/null +++ b/ops/interaction/fs/xfs/xfs_operation_latency/ops-help @@ -0,0 +1 @@ +system: summarize XFS operation latency /\*统计xfs操作时延*/\ diff --git a/ops/interaction/fs/xfs/xfs_operation_latency/ops-run b/ops/interaction/fs/xfs/xfs_operation_latency/ops-run new file mode 100644 index 0000000000000000000000000000000000000000..1cb4e7c127d81377a93ce52594bcd5578cd7e837 --- /dev/null +++ b/ops/interaction/fs/xfs/xfs_operation_latency/ops-run @@ -0,0 +1,29 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work:summarize XFS operation latency\033[0m\n" + printf "\033[32muses:oc-ops interaction cpu xfsdist [\$pid]\033[0m\n" + #------ frequency ------ + if [ $# -lt 1 ]; then + printf "\033[32mdefault:summarize XFS operation latency\033[0m\n" + /usr/share/bcc/tools/xfsdist 6 + elif [ $# -eq 1 ]; then + printf "\033[32msummarize XFS operation latency for pid:$1\033[0m\n" + /usr/share/bcc/tools/xfsdist 6 -p $1 + fi + +} +main $* diff --git a/ops/interaction/fs/xfs/xfs_recover/ops-help b/ops/interaction/fs/xfs/xfs_recover/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..d715f87c755552367053d6e43d14f403308eb16e --- /dev/null +++ b/ops/interaction/fs/xfs/xfs_recover/ops-help @@ -0,0 +1 @@ +fs: xfs tools /\*xfs 修复工具*/\ diff --git a/ops/interaction/fs/xfs/xfs_recover/ops-run b/ops/interaction/fs/xfs/xfs_recover/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..cf20f559ff9c896e5310a82def319d6ecbec1242 --- /dev/null +++ b/ops/interaction/fs/xfs/xfs_recover/ops-run @@ -0,0 +1,37 @@ +#!/bin/bash +# -*- coding: utf-8 -*- + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + if [ $# -lt 1 ]; then + printf "\033[31m输入需要查询的能力:\033[0m\n" + printf "\033[32moc-ops interaction fs xfs help\033[0m\n" + printf "\033[32moc-ops interaction fs xfs recover xxx(device, such as /dev/sda) \* 修复xfs文件系统*\ \033[0m\n" + exit 1 + fi + + sub="$1" + #work + printf "\033[31m-------------find work: $sub\033[0m\n" + printf "\033[32mNormal:For detail():oc-ops fs xfs_recover -h\033[0m\n" + + #------ recover fs ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + if [ $# -lt 1 ]; then + $dir/oc-ops fs xfs_recover -h + exit 1 + fi + if [ $# -eq 2 ]; then + $dir/oc-ops fs xfs_recover -d $2 + else : + printf "\033[32moc-ops interaction fs xfs recover xxx(device, such as /dev/sda) \* 修复xfs文件系统*\ \033[0m\n" + $dir/oc-ops fs xfs_recover -h + fi +} +main $* diff --git a/ops/interaction/io/disk_performance/ops-help b/ops/interaction/io/disk_performance/ops-help new file mode 100644 index 
0000000000000000000000000000000000000000..c888bc8a1dd3b7668444c559f9f5052b376e2f97 --- /dev/null +++ b/ops/interaction/io/disk_performance/ops-help @@ -0,0 +1 @@ +IO: disk io speed statistics /\*磁盘io读写性能*/\ \ diff --git a/ops/interaction/io/disk_performance/ops-run b/ops/interaction/io/disk_performance/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..59aed6cefb6d7af681673618b440000d451d108a --- /dev/null +++ b/ops/interaction/io/disk_performance/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: disk performance\033[0m\n" + + #------ disk ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops io s_iostat $* +} +main $* diff --git a/ops/interaction/io/disk_statistics/ops-help b/ops/interaction/io/disk_statistics/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..282214e7bbd9bd369dacb02c9f8fca18a402ce7f --- /dev/null +++ b/ops/interaction/io/disk_statistics/ops-help @@ -0,0 +1 @@ +IO: disk status statistics, read/write/io /\*磁盘统计*/\ diff --git a/ops/interaction/io/disk_statistics/ops-run b/ops/interaction/io/disk_statistics/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..ac057c939625f72b7a7c5642afd59aa649ee7ffc --- /dev/null +++ b/ops/interaction/io/disk_statistics/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: disk statistics\033[0m\n" + + #------ disk ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops mem vmstat -d +} +main $* diff --git a/ops/interaction/io/disk_statistics_sum/ops-help b/ops/interaction/io/disk_statistics_sum/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..5a712dfb8b05f827222bd458fa88e4e068b425cb --- /dev/null +++ b/ops/interaction/io/disk_statistics_sum/ops-help @@ -0,0 +1 @@ +IO: disk status sum statistics /*磁盘io统计和*/ diff --git a/ops/interaction/io/disk_statistics_sum/ops-run b/ops/interaction/io/disk_statistics_sum/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..41168b7a128d84f8fe545c6acca6e92383215b8b --- /dev/null +++ b/ops/interaction/io/disk_statistics_sum/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: disk statistics sum\033[0m\n" + + #------ disk ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops mem vmstat -D +} +main $* diff --git a/ops/interaction/io/ops-help b/ops/interaction/io/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..37acaf853d3acd21cd025436a18e905a9f980854 --- /dev/null +++ b/ops/interaction/io/ops-help @@ -0,0 +1 @@ +HCI: show io status, such as disk, partition .etc diff --git a/ops/interaction/io/partition_statistics/ops-help 
b/ops/interaction/io/partition_statistics/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..e861127c5a59b33685f7404cbb3f6a090833d542 --- /dev/null +++ b/ops/interaction/io/partition_statistics/ops-help @@ -0,0 +1 @@ +IO: partition status statistics /\*分区统计*/\ diff --git a/ops/interaction/io/partition_statistics/ops-run b/ops/interaction/io/partition_statistics/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..5edf62d2c5b6b4398c3df30f4ede062f7ed041f9 --- /dev/null +++ b/ops/interaction/io/partition_statistics/ops-run @@ -0,0 +1,28 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: partition status statistics\033[0m\n" + + #------ disk ------ + if [ $# -lt 1 ]; then + printf "\033[32mneed \$dev: oc-ops interaction io patition_statistics \$dev /\*分区统计*/\ \033[0m\n" + exit 1 + fi + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops mem vmstat -p $1 +} +main $* diff --git a/ops/interaction/io/scene_io/bfq_stat/ops-help b/ops/interaction/io/scene_io/bfq_stat/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..d0c63b54a720c9cafdb30a12b2378b57686c57a1 --- /dev/null +++ b/ops/interaction/io/scene_io/bfq_stat/ops-help @@ -0,0 +1 @@ +IO: see bfq bio and req process latency /\*查看bio从提交入队到派发,从派发到endbio,及ata/work处理时延*/\ diff --git a/ops/interaction/io/scene_io/bfq_stat/ops-run b/ops/interaction/io/scene_io/bfq_stat/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..a34cbc4ca93855e3929563d467d799aa94a3bbb1 --- /dev/null +++ b/ops/interaction/io/scene_io/bfq_stat/ops-run @@ -0,0 +1,43 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: see bfq bio and req process latency\033[0m\n" + printf "\033[35mUses: t-ops interaction io scene_io bfq_stat [disk]\033[0m\n" + printf "\033[35mTwo parameters, if need, manual entry: [disk], default all disk\033[0m\n" + printf "\033[35mSuch as:t-ops interaction io scene_io bfq_stat vda\033[0m\n": + printf "\033[35mNote: may more print log\033[0m\n" + + disk="" + delay=5 + while [[ $# -gt 0 ]]; do + echo "$1" | [ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && has_delay=$1 + if [ $has_delay != $delay ] && [ $has_delay != 0 ]; then + delay=$has_delay + shift # past value + continue + fi + disk=$1 + shift # past value + done + #------ disk ------ + if [ ! 
-z "$disk" ]; then + t-ops os_stat -f bfq_insert_requests -1 bfq_dispatch_request -2 ata_qc_complete -3 ata_qc_complete_internal -4 ata_qc_issue -5 blk_mq_run_work_fn -6 scsi_mq_get_budget -7 scsi_mq_get_budget -8 mod_delayed_work_on -9 $disk -n 8 -fg 10 -sc 67 -de $delay + else : + t-ops os_stat -f bfq_insert_requests -1 bfq_dispatch_request -2 ata_qc_complete -3 ata_qc_complete_internal -4 ata_qc_issue -5 blk_mq_run_work_fn -6 scsi_mq_get_budget -7 scsi_mq_get_budget -8 mod_delayed_work_on -n 8 -fg 10 -sc 67 -de $delay + fi + printf "\033[32mdata log: /var/log/t-ops/os_stat.log\033[0m\n" +} +main $* diff --git a/ops/interaction/io/scene_io/find_inode_reserve_blocks/ops-help b/ops/interaction/io/scene_io/find_inode_reserve_blocks/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..8b8b2e9f99a6205f03dfbc067ee2a940b0589918 --- /dev/null +++ b/ops/interaction/io/scene_io/find_inode_reserve_blocks/ops-help @@ -0,0 +1 @@ +IO: show realtime change ofdirty block numbers /\*查看dirty block的数量变化情况, 大于free的一半时,会flush每次的写*/\ diff --git a/ops/interaction/io/scene_io/find_inode_reserve_blocks/ops-run b/ops/interaction/io/scene_io/find_inode_reserve_blocks/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..58da99ac50c258b5ef7c7a24b0cf5f2af9525fba --- /dev/null +++ b/ops/interaction/io/scene_io/find_inode_reserve_blocks/ops-run @@ -0,0 +1,80 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work:show realtime change of dirty block numbers\033[0m\n" + printf "\033[35mUses: t-ops interaction io scene_io find_inode_reserve_blocks [disk] [type]\033[0m\n" + printf "\033[35mParameters:[disk]: default all disk; [type]: list(default), scan inode hashtable list; proc, scan files opened by proc;\033[0m\n" + printf "\033[35mSuch as:t-ops interaction io scene_io find_inode_reserve_blocks [vdb]\033[0m\n" + + disk="" + scan_type=71 + while [[ $# -gt 0 ]]; do + if [ $(strstr "proc" $1) ] || [ $(strstr "list" $1) ]; then + scan_type=70 + shift # past value + continue + fi + disk=$1 + shift # past value + done + + if [ ! -z "$disk" ]; then + t-ops os_stat -f vfs_read -1 ext4_da_reserve_space -2 tmp -3 $disk -4 inode_hashtable -5 inode_hash_lock -6 ihash_entries -7 nr_kernel_pages -8 i_hash_shift -n 2 -fg 10 -sc $scan_type -de 100000 & + else : + t-ops os_stat -f vfs_read -n 1 -fg 10 -sc 68 -de 100000 & + fi + + sleep 10 + t-ops os_stat -f vfs_read -n 1 -fg 12 & + sleep 1 + + size=`expr 1 \* 1024` + size=`expr $size \* 1024` + i=0 + subdir=$(dirname $0) + dir=${subdir%ops*} + dir="/var/log/t-ops/" + touch $dir/os_stat_total.log + touch $dir/os_stat.log + while true; do + t-ops os_stat -f vfs_read -n 1 -fg 11 + filesize=$(ls -l "$dir/os_stat_total.log" | awk '{print $5}') + if [ $filesize -ge $size ]; then + mv $dir/os_stat_total.log $dir/os_stat_total.log_$i.txt + touch $dir/os_stat_total.log + i=$(($i + 1)) + fi + filesize=$(ls -l "$dir/os_stat.log" | awk '{print $5}') + if [ $filesize -ge $size ]; then + mv $dir/os_stat.log $dir/os_stat.log_$i.txt + touch $dir/os_stat.log + i=$(($i + 1)) + fi + pid=$(pgrep "os_stat_test") + if [ ! 
-n "$pid" ]; then + pid=$(pgrep "long") + kill -9 $pid + pid=$(pgrep "ops-run") + kill -9 $pid + pid=$(pgrep "scene_io") + kill -9 $pid + sleep 1 + rmmod os_aware + break + fi + done +} +main $* diff --git a/ops/interaction/io/scene_io/freeblock_is_enough_ornot/ops-run b/ops/interaction/io/scene_io/freeblock_is_enough_ornot/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..da54d870d8f690af90910b7a5b1165794b0b12d5 --- /dev/null +++ b/ops/interaction/io/scene_io/freeblock_is_enough_ornot/ops-run @@ -0,0 +1,51 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show freeblock is enough ornot\033[0m\n" + printf "\033[35mUses: t-ops interaction io scene_io freeblock_is_enough_ornot [fs] [disk]\033[0m\n" + printf "\033[35mTwo parameters, if need, manual entry: [fs] [disk], default ext4, all disk\033[0m\n" + printf "\033[35mSuch as:t-ops interaction io scene_io freeblock_is_enough_ornot ext4 vdb\033[0m\n" + printf "\033[35mNote: may more print log\033[0m\n" + + delay=5 + + fs="ext4" + disk="" + has_delay=0 + while [[ $# -gt 0 ]]; do + if [ $(strstr ext4 $1) ] || [ $(strstr xfs $1) ]; then + fs=$1 + shift # past value + continue + fi + echo "$1" | [ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && has_delay=$1 + if [ $has_delay != $delay ] && [ $has_delay != 0 ]; then + delay=$has_delay + shift # past value + continue + fi + disk=$1 + shift # past value + done + #------ disk ------ + if [ ! -z "$disk" ]; then + t-ops os_stat -fg 10 -f ext4_nonda_switch -1 ext4_file_read_iter -2 $fs -3 $disk -n 2 -sc 67 -de $delay + else : + t-ops os_stat -fg 10 -f ext4_nonda_switch -1 ext4_file_read_iter -2 $fs -n 2 -sc 67 -de $delay + fi + printf "\033[32mdata log: /var/log/t-ops/os_stat.log\033[0m\n" +} +main $* diff --git a/ops/interaction/io/scene_io/stat_block_uses/ops-run b/ops/interaction/io/scene_io/stat_block_uses/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..7cfb3d4ab674821a14b25f4e16767543e49f1cf8 --- /dev/null +++ b/ops/interaction/io/scene_io/stat_block_uses/ops-run @@ -0,0 +1,49 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work:show realtime change ofdirty block numbers\033[0m\n" + printf "\033[35mUses: t-ops interaction io scene_io stat_block_uses [fs] [disk] [delay]\033[0m\n" + printf "\033[35mTwo parameters, if need, manual entry: [fs] [disk], default ext4, all disk\033[0m\n" + printf "\033[35mSuch as:t-ops interaction io scene_io stat_block_uses ext4 vdb 10\033[0m\n" + printf "\033[35mNote: may more print log\033[0m\n" + + fs="ext4" + disk="" + delay=1 + has_delay=0 + while [[ $# -gt 0 ]]; do + if [ $(strstr ext4 $1) ] || [ $(strstr xfs $1) ]; then + fs=$1 + shift # past value + continue + fi + echo "$1" | [ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && has_delay=$1 + if [ $has_delay != $delay ] && [ $has_delay != 0 ]; then + delay=$has_delay + shift # past value + continue + fi + disk=$1 + shift # past value + done + #------ disk ------ + if [ ! 
-z "$disk" ]; then + t-ops os_stat -f ext4_da_update_reserve_space -1 ext4_mb_new_blocks -2 ext4_da_release_space -3 ext4_da_reserve_space -4 ext4_rereserve_cluster -5 ext4_clear_inode -6 ext4_evict_inode -7 $fs -8 $disk -n 7 -fg 10 -sc 67 -de $delay + else : + t-ops os_stat -f ext4_da_update_reserve_space -1 ext4_claim_free_clusters -2 ext4_da_release_space -3 ext4_da_reserve_space -4 ext4_rereserve_cluster -5 $fs -n 5 -fg 10 -sc 67 -de $delay + fi +} +main $* diff --git a/ops/interaction/mem/enter_memory_leak/ops-help b/ops/interaction/mem/enter_memory_leak/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b1221a9af91a6150f87f726ee3138a99bc00fa4c --- /dev/null +++ b/ops/interaction/mem/enter_memory_leak/ops-help @@ -0,0 +1 @@ +HCI: memory leak tools /\*内存泄漏*/\ \ diff --git a/ops/interaction/mem/enter_memory_leak/ops-run b/ops/interaction/mem/enter_memory_leak/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..fa2c7bdacfdc0c3747fdb5180a44a1fc098e15dd --- /dev/null +++ b/ops/interaction/mem/enter_memory_leak/ops-run @@ -0,0 +1,29 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: memory leak\033[0m\n" + printf "\033[32moc-ops interaction mem memory_leak \*内存泄漏*\ \033[0m\n" + + #------ memleak ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + if [ $# -le 0 ]; then + $dir/oc-ops mem enter_memleak 10 1 + else : + $dir/oc-ops mem enter_memleak $@ + fi +} +main $* diff --git a/ops/interaction/mem/oom_trace/ops-help b/ops/interaction/mem/oom_trace/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..3e1919ad63604cf9e3a5c36915b8ff147bf4f996 --- /dev/null +++ b/ops/interaction/mem/oom_trace/ops-help @@ -0,0 +1 @@ +mem: traces oom killer, prints basic details /\*oom时详细的信息:被哪个进程触发,当时负载等*/\ diff --git a/ops/interaction/mem/oom_trace/ops-run b/ops/interaction/mem/oom_trace/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..7c5af05afb8ebd8bc21fa1683e8e835bd3542e0f --- /dev/null +++ b/ops/interaction/mem/oom_trace/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: traces oom killer, prints basic details\033[0m\n" + + #------ show more info about oom ------ + /usr/share/bcc/tools/oomkill +} +main $* diff --git a/ops/interaction/mem/ops-help b/ops/interaction/mem/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..e9d95117054fa29d88acb6d00a271ee36af7801a --- /dev/null +++ b/ops/interaction/mem/ops-help @@ -0,0 +1 @@ +HCI: mem tools, such as memleak, memstrack, vmstat .etc. 
see oc-ops interaction mem diff --git a/ops/interaction/mem/scan_memory_allocate/ops-help b/ops/interaction/mem/scan_memory_allocate/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..a1b622624f669a5475d6e51fcecd5e6945e680c7 --- /dev/null +++ b/ops/interaction/mem/scan_memory_allocate/ops-help @@ -0,0 +1 @@ +mem: scan memory allcation /\*扫描内存分配情况,统计分配1~1024个page的情况*/\ \ diff --git a/ops/interaction/mem/scan_memory_allocate/ops-run b/ops/interaction/mem/scan_memory_allocate/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..10f629c427780aeabf2ee87b98b09308c25efc5b --- /dev/null +++ b/ops/interaction/mem/scan_memory_allocate/ops-run @@ -0,0 +1,27 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: scan memory allcation\033[0m\n" + printf "\033[32mt-ops interaction mem scan_memory_allocate [delay] \033[0m\n" + + args=$# + delay=5 + echo "${!args}"|[ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && delay=${!args} + + #------ memleak ------ + t-ops os_stat -f __alloc_pages_nodemask -1 free_one_page -n 2 -fg 10 -sc 72 -de $delay +} +main $* diff --git a/ops/interaction/mem/slub_status/ops-help b/ops/interaction/mem/slub_status/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..529e583bdacb6cdaac37d7ff85529839d7344690 --- /dev/null +++ b/ops/interaction/mem/slub_status/ops-help @@ -0,0 +1 @@ +HCI: slub tools /\*slub状态*/\ \ diff --git a/ops/interaction/mem/slub_status/ops-run b/ops/interaction/mem/slub_status/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..ae06abd79dfea39217660ca05f70f5503596c9ac --- /dev/null +++ b/ops/interaction/mem/slub_status/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: slub status\033[0m\n" + printf "\033[32moc-ops interaction mem slub_status \*slub状态*\ \033[0m\n" + + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops mem vmstat -m +} +main $* diff --git a/ops/interaction/mem/virtual_memory_status/ops-help b/ops/interaction/mem/virtual_memory_status/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..0545a1d0fa5d71606c71bab23d50d15db8e4aac3 --- /dev/null +++ b/ops/interaction/mem/virtual_memory_status/ops-help @@ -0,0 +1 @@ +HCI: virtual memory tools /\*虚拟内存状态*/\ \ diff --git a/ops/interaction/mem/virtual_memory_status/ops-run b/ops/interaction/mem/virtual_memory_status/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..a0f1abfec9d006e430ebc1f84b8bf89a179a8ed2 --- /dev/null +++ b/ops/interaction/mem/virtual_memory_status/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: virtual memory status\033[0m\n" + printf "\033[32moc-ops interaction 
mem virtual_memory_status \*virtual memory status*\ \033[0m\n" + + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops mem vmstat -a +} +main $*
diff --git a/ops/interaction/net/all_port/ops-help b/ops/interaction/net/all_port/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..4b89277206f1ad0673a94f485de040b174df0c6a --- /dev/null +++ b/ops/interaction/net/all_port/ops-help @@ -0,0 +1 @@ +net: show all ports
diff --git a/ops/interaction/net/all_port/ops-run b/ops/interaction/net/all_port/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..69835a33e482f2dd9b5b7f96ce90777c83062665 --- /dev/null +++ b/ops/interaction/net/all_port/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show all ports\033[0m\n" + + #------ network ------ + netstat -a +} +main $*
diff --git a/ops/interaction/net/bind_ip_mac/ops-help b/ops/interaction/net/bind_ip_mac/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..eb2e57dcb8cd9be026b2d2ce28b60774fb2ad53c --- /dev/null +++ b/ops/interaction/net/bind_ip_mac/ops-help @@ -0,0 +1 @@ +net: bind an ip address to a mac address
diff --git a/ops/interaction/net/bind_ip_mac/ops-run b/ops/interaction/net/bind_ip_mac/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..d5204dded737077e70a6d9e135a1762d92546511 --- /dev/null +++ b/ops/interaction/net/bind_ip_mac/ops-run @@ -0,0 +1,26 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: bind ip and mac\033[0m\n" + + if [ $# -lt 2 ]; then + printf "\033[32mneed ip and mac: oc-ops interaction net bind_ip_mac \$ip \$mac\033[0m\n" + exit 1 + fi + arp -s $1 $2 + +} +main $*
diff --git a/ops/interaction/net/capture_net_packet/ops-help b/ops/interaction/net/capture_net_packet/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..ea882e7f825ac73cd6784e14275a796ae206126d --- /dev/null +++ b/ops/interaction/net/capture_net_packet/ops-help @@ -0,0 +1 @@ +net: capture network packets; an optional filter can be given, e.g. port 80 only
diff --git a/ops/interaction/net/capture_net_packet/ops-run b/ops/interaction/net/capture_net_packet/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..035bd663115158b1f0e3331dd51ee45a850e6c1f --- /dev/null +++ b/ops/interaction/net/capture_net_packet/ops-run @@ -0,0 +1,32 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: capture network packets\033[0m\n" + + #------ network ------ + if [ $# -lt 1 ]; then + printf "\033[32moc-ops interaction net capture_net_packet \$iface [\$port]\033[0m\n" + printf "\033[32msuch as, capture all : oc-ops interaction net capture_net_packet eth0\033[0m\n" + printf "\033[32msuch as, only capture port 80: oc-ops interaction net capture_net_packet eth0 80\033[0m\n" + exit 1 + fi + if [ $# -eq 2 ]; then + tcpdump -i $1 -nn -s 120 -v port $2 + else : + tcpdump -i $1 -nn -s 120 -v + fi +} +main $*
diff --git a/ops/interaction/net/concurrent_connections/ops-help b/ops/interaction/net/concurrent_connections/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c815041cd49ab658a9007165283ac54854558f2d --- /dev/null +++ b/ops/interaction/net/concurrent_connections/ops-help @@ -0,0 +1 @@ +net: show the number of concurrent connections
diff --git a/ops/interaction/net/concurrent_connections/ops-run b/ops/interaction/net/concurrent_connections/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..bac4b5180d6af806311057bbadb4b4e98932689a --- /dev/null +++ b/ops/interaction/net/concurrent_connections/ops-run @@ -0,0 +1,23 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show the number of concurrent connections \033[0m\n" + + #------ network ------ + ss -o state established | wc -l + +} +main $*
diff --git a/ops/interaction/net/delete_arp_item/ops-help b/ops/interaction/net/delete_arp_item/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..e28c2b4de8c2d60d325b55f6c33b4110854e5166 --- /dev/null +++ b/ops/interaction/net/delete_arp_item/ops-help @@ -0,0 +1 @@ +net: delete the arp cache entry of a given ip
diff --git a/ops/interaction/net/delete_arp_item/ops-run b/ops/interaction/net/delete_arp_item/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..cfacf074cf4fa93cfbbc31a21ee7304e5734b396 --- /dev/null +++ b/ops/interaction/net/delete_arp_item/ops-run @@ -0,0 +1,26 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: delete the arp cache entry of a given ip\033[0m\n" + + #------ network ------ + if [ $# -lt 1 ]; then + printf "\033[32mneed ip: oc-ops interaction net delete_arp_item \$ip\033[0m\n" + exit 1 + fi + arp -d $1 +} +main $*
diff --git a/ops/interaction/net/delete_arp_port_item/ops-help b/ops/interaction/net/delete_arp_port_item/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..8b9c4d05fb8e445dbdd4929cecbbc9f796882b61 --- /dev/null +++ b/ops/interaction/net/delete_arp_port_item/ops-help @@ -0,0 +1 @@ +net: delete the arp cache entry of a given interface and ip
diff --git a/ops/interaction/net/delete_arp_port_item/ops-run b/ops/interaction/net/delete_arp_port_item/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..189e2aef128fe708c1843c47f982983b5239b2d2 --- /dev/null +++ b/ops/interaction/net/delete_arp_port_item/ops-run @@ -0,0 +1,26 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: delete the arp cache entry of a given port and ip\033[0m\n" + + #------ network ------ + if [ $# -lt 2 ]; then + printf "\033[32mneed port(such as eth0) and ip: oc-ops interaction net delete_arp_port_item \$port \$ip\033[0m\n" + exit 1 + fi + arp -i $1 -d $2 +} +main $*
diff --git a/ops/interaction/net/detail_arp_cache_items/ops-help b/ops/interaction/net/detail_arp_cache_items/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..375204311f821bf80b54ee818171877903b4b99c --- /dev/null +++ b/ops/interaction/net/detail_arp_cache_items/ops-help @@ -0,0 +1 @@ +net: show detailed arp cache entries
diff --git a/ops/interaction/net/detail_arp_cache_items/ops-run b/ops/interaction/net/detail_arp_cache_items/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..1c69449bf48b805c1efb6c48183f3041d8ca2cd4 --- /dev/null +++ b/ops/interaction/net/detail_arp_cache_items/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show detailed arp cache entries\033[0m\n" + + #------ network ------ + arp -v
} +main $*
diff --git a/ops/interaction/net/iface_interface_table/ops-help b/ops/interaction/net/iface_interface_table/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..f08cd991e247f7284e7935ce66b7c41d67c14b62 --- /dev/null +++ b/ops/interaction/net/iface_interface_table/ops-help @@ -0,0 +1 @@ +net: show the interface table of a given iface
diff --git a/ops/interaction/net/iface_interface_table/ops-run b/ops/interaction/net/iface_interface_table/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..383f1bfe708b459477325de2e32801efccfac379 --- /dev/null +++ b/ops/interaction/net/iface_interface_table/ops-run @@ -0,0 +1,27 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show the interface table of a given iface\033[0m\n" + + #------ network ------ + if [ $# -lt 1 ]; then + printf "\033[32mneed iface: oc-ops interaction net iface_interface_table \$iface\033[0m\n" + exit 1 + fi + netstat -I $1 -i + +} +main $*
diff --git a/ops/interaction/net/interface_table/ops-help b/ops/interaction/net/interface_table/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..462057abaee2a4040ae529c3bf923d2c3fc8b620 --- /dev/null +++ b/ops/interaction/net/interface_table/ops-help @@ -0,0 +1 @@ +net: show the interface table
diff --git a/ops/interaction/net/interface_table/ops-run b/ops/interaction/net/interface_table/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..af799e3f4356e50da6b82dbbd877a6589c45ae5b --- /dev/null +++ b/ops/interaction/net/interface_table/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf
"\033[31m-------------find work: show interface table\033[0m\n" + + #------ network ------ + netstat -i +} +main $* diff --git a/ops/interaction/net/monitor_route_table/ops-help b/ops/interaction/net/monitor_route_table/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..9a7c3d62ee2e949351c8fa38396eb907c24c5d46 --- /dev/null +++ b/ops/interaction/net/monitor_route_table/ops-help @@ -0,0 +1 @@ +net: monitor route table /\*监控路由表*/\ diff --git a/ops/interaction/net/monitor_route_table/ops-run b/ops/interaction/net/monitor_route_table/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..08ff6f23767fd6b7a10469e983239b6f467ce96c --- /dev/null +++ b/ops/interaction/net/monitor_route_table/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: monitor route table\033[0m\n" + + #------ network ------ + ip monitor all +} +main $* diff --git a/ops/interaction/net/network_bandwidth/ops-help b/ops/interaction/net/network_bandwidth/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..09499de16db62a5fe3b3fcb37c59d1cf8c1a3eb7 --- /dev/null +++ b/ops/interaction/net/network_bandwidth/ops-help @@ -0,0 +1 @@ +net: show network bandwidth /\*显示网络带宽*/\ diff --git a/ops/interaction/net/network_bandwidth/ops-run b/ops/interaction/net/network_bandwidth/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..a57365947741fba490e6f843af7b06a3b5fb67b9 --- /dev/null +++ b/ops/interaction/net/network_bandwidth/ops-run @@ -0,0 +1,26 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show network bandwidth\033[0m\n" + + #------ network ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + + $dir/oc-ops net iftop + +} +main $* diff --git a/ops/interaction/net/ops-help b/ops/interaction/net/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..a7ef9367606b6467d2b64110cc719cac3eb2eb66 --- /dev/null +++ b/ops/interaction/net/ops-help @@ -0,0 +1 @@ +HCI: net tools diff --git a/ops/interaction/net/port_of_proc/ops-help b/ops/interaction/net/port_of_proc/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b48332a5615d793448ec83c59f48d5e3d926c433 --- /dev/null +++ b/ops/interaction/net/port_of_proc/ops-help @@ -0,0 +1 @@ +net: show port of process /\*查看程序端口*/\ diff --git a/ops/interaction/net/port_of_proc/ops-run b/ops/interaction/net/port_of_proc/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..c7e28bbb5c8fb696c3d14594bb6bf5961cebe39c --- /dev/null +++ b/ops/interaction/net/port_of_proc/ops-run @@ -0,0 +1,31 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show port of process\033[0m\n" + + #------ 
network ------ + if [ $# -lt 1 ]; then + printf "\033[32mneed process name: oc-ops interaction net port_of_proc \$name\033[0m\n" + exit 1 + fi + result=$(netstat -ap | grep $1) + if [ -z "$result" ]; then + printf "\033[32m[$1] has no interface\033[0m\n" + else : + printf "\033[32m$result\033[0m\n" + fi +} +main $* diff --git a/ops/interaction/net/proc_of_port/ops-help b/ops/interaction/net/proc_of_port/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..72c6352f89d9de91862eafaae1c04d1e7461a71b --- /dev/null +++ b/ops/interaction/net/proc_of_port/ops-help @@ -0,0 +1 @@ +net: show processes who use the port /\*使用此端口的程序*/\ diff --git a/ops/interaction/net/proc_of_port/ops-run b/ops/interaction/net/proc_of_port/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..640c92cc7f6cf9f51807f861bd596f4645ae70cf --- /dev/null +++ b/ops/interaction/net/proc_of_port/ops-run @@ -0,0 +1,31 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show processes who use the port\033[0m\n" + + #------ network ------ + if [ $# -lt 1 ]; then + printf "\033[32mneed \$port: oc-ops interaction net proc_of_port \$port\033[0m\n" + exit 1 + fi + result=$(netstat -ap | grep ":$1") + if [ -z "$result" ]; then + printf "\033[32minterface[$1] is not exist\033[0m\n" + else : + printf "\033[32m$result\033[0m\n" + fi +} +main $* diff --git a/ops/interaction/net/route_table/ops-help b/ops/interaction/net/route_table/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..15baf6a23db0a6eb6a45e544614f40cf6fd06fb4 --- /dev/null +++ b/ops/interaction/net/route_table/ops-help @@ -0,0 +1 @@ +net: show route table /\*显示路由表*/\ diff --git a/ops/interaction/net/route_table/ops-run b/ops/interaction/net/route_table/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..cbf728d789ede697b47dd22b5b0d15013f26a78c --- /dev/null +++ b/ops/interaction/net/route_table/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show route table\033[0m\n" + + #------ network ------ + netstat -r +} +main $* diff --git a/ops/interaction/net/rx_to_epoll/ops-help b/ops/interaction/net/rx_to_epoll/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..4826700a5caede0c43e4c477aedc1b61ea4623be --- /dev/null +++ b/ops/interaction/net/rx_to_epoll/ops-help @@ -0,0 +1 @@ +net: latency from packet received to receiver waked up /\*软中断收到包,到阻塞在epoll的进程被唤醒执行之间时延*/\ diff --git a/ops/interaction/net/rx_to_epoll/ops-run b/ops/interaction/net/rx_to_epoll/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..c9dda01b05e45d2833eea71da89595e81a4b7b8d --- /dev/null +++ b/ops/interaction/net/rx_to_epoll/ops-run @@ -0,0 +1,29 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + 
echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: latency from packet received to receiver waked up\033[0m\n" + printf "\033[32m-------------uses:oc-ops interaction net rx_to_epoll [delay], defalut 5s\033[0m\n" + + #------ network ------ + args=$# + delay=5 + echo "${!args}"|[ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && delay=${!args} + + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops os_stat -f net_rx_action -t do_epoll_wait -s 100 -fg 8 -de $delay -o 0 +} +main $* diff --git a/ops/interaction/net/scene_net/get_data.py b/ops/interaction/net/scene_net/get_data.py new file mode 100755 index 0000000000000000000000000000000000000000..e26641050c797eaca0b079119c426d49e706093d --- /dev/null +++ b/ops/interaction/net/scene_net/get_data.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import sys +import os + + +def parse_conf(_f, _func): + num=0 + i=0 + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # skip not ave lines + if _ll.find(':' + _func) > 0: + i+=1 + if i < num: + continue + if _ll.find("num:") <= 0: + continue + + print(_ll) + + +if __name__ == "__main__": + _f1 = sys.argv[1]; + _func = sys.argv[2]; + + _kv1 = parse_conf(_f1, _func); diff --git a/ops/interaction/net/scene_net/ops-help b/ops/interaction/net/scene_net/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..ef727762401b2fecd2197a034a94ec16802c8315 --- /dev/null +++ b/ops/interaction/net/scene_net/ops-help @@ -0,0 +1 @@ +net: refcount of net or sk /\*查看net/sk结构体refcount计数*/\ diff --git a/ops/interaction/net/scene_net/realtime_refcount/ops-help b/ops/interaction/net/scene_net/realtime_refcount/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..2f9a36aaad36bdc075b25e68697885a42500352e --- /dev/null +++ b/ops/interaction/net/scene_net/realtime_refcount/ops-help @@ -0,0 +1 @@ +net: show net refcount's realtime change /\*查看net结构体refcount计数*/\ diff --git a/ops/interaction/net/scene_net/realtime_refcount/ops-run b/ops/interaction/net/scene_net/realtime_refcount/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..fc07d24db6160b690e3244eecce04bdf7304c0d9 --- /dev/null +++ b/ops/interaction/net/scene_net/realtime_refcount/ops-run @@ -0,0 +1,28 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work:show realtime change of net refcount\033[0m\n" + printf "\033[35mUses: t-ops interaction net scene_net [delay]\033[0m\n" + printf "\033[35mSuch as:t-ops interaction net scene_net 10\033[0m\n" + + delay=5 + args=$# + echo "${!args}"|[ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && delay=${!args} + + #------ disk ------ + t-ops os_stat -f copy_net_ns -1 setup_net -2 net_alloc_generic -3 inc_ucount -4 dec_ucount -5 net_drop_ns -6 key_remove_domain -7 deferred_put_nlk_sk -8 sk_alloc -9 __sk_destruct -10 sk_clone_lock -11 __put_net -12 pernet_ops_rwsem -13 net_cachep -n 12 -fg 10 -sc 69 -de $delay +} +main $* diff --git a/ops/interaction/net/scene_net/slub_scan/ops-help b/ops/interaction/net/scene_net/slub_scan/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..2cacf571f2bba343d88c9811e48b301775aea393 
--- /dev/null +++ b/ops/interaction/net/scene_net/slub_scan/ops-help @@ -0,0 +1 @@ +net: show sk refcount bases at slub /\*通过sturct sock sk结构占用的slub来查看所有sk引用计数*/\ diff --git a/ops/interaction/net/scene_net/slub_scan/ops-run b/ops/interaction/net/scene_net/slub_scan/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..3848b5810955f31ce90942cf7ad8c2e041ef787d --- /dev/null +++ b/ops/interaction/net/scene_net/slub_scan/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work:show sk refcount bases at slub\033[0m\n" + printf "\033[35mUses: t-ops interaction net scene_net slub_scan [delay]\033[0m\n" + printf "\033[35mSuch as:t-ops interaction net scene_net slub_scan 10\033[0m\n" + + #------ disk ------sc : SIGRTMAX + 5 + t-ops os_stat -n 0 -fg 13 -sc 71 +} +main $* diff --git a/ops/interaction/net/service_listen_status/ops-help b/ops/interaction/net/service_listen_status/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..508c2cd7060448df4c87e5714ccbbefa86ea8864 --- /dev/null +++ b/ops/interaction/net/service_listen_status/ops-help @@ -0,0 +1 @@ +net: show service listening status /\*显示服务监听状态*/\ diff --git a/ops/interaction/net/service_listen_status/ops-run b/ops/interaction/net/service_listen_status/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..ddf8bf01b949465de82e8b0cdad92f3a9c03f081 --- /dev/null +++ b/ops/interaction/net/service_listen_status/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show service listening status\033[0m\n" + + #------ network ------ + netstat -ltp +} +main $* diff --git a/ops/interaction/net/show_HTTP_connections/ops-help b/ops/interaction/net/show_HTTP_connections/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c983464ef5d6aabf163238705c7a39c77f6614c6 --- /dev/null +++ b/ops/interaction/net/show_HTTP_connections/ops-help @@ -0,0 +1 @@ +net: show HTTP connections /\*显示HTTP连接*/\ diff --git a/ops/interaction/net/show_HTTP_connections/ops-run b/ops/interaction/net/show_HTTP_connections/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..5d17c8a5f73c5d940e2232a3cae5ce2179c65962 --- /dev/null +++ b/ops/interaction/net/show_HTTP_connections/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show HTTP connections\033[0m\n" + + #------ network ------ + ss -o state established '( dport = :http or sport = :http )' +} +main $* diff --git a/ops/interaction/net/show_SMTP_connections/ops-help b/ops/interaction/net/show_SMTP_connections/ops-help new file mode 100644 index 
0000000000000000000000000000000000000000..9ece26eb5421d02dc522b261591470cd99505f09 --- /dev/null +++ b/ops/interaction/net/show_SMTP_connections/ops-help @@ -0,0 +1 @@ +net: show SMTP connections /\*显示SMTP连接*/\ diff --git a/ops/interaction/net/show_SMTP_connections/ops-run b/ops/interaction/net/show_SMTP_connections/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..315b26017a08231e5474363ac4cbc577bfdc1567 --- /dev/null +++ b/ops/interaction/net/show_SMTP_connections/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show SMTP connections\033[0m\n" + + #------ network ------ + ss -o state established '( dport = :smtp or sport = :smtp )' +} +main $* diff --git a/ops/interaction/net/show_arp_cache/ops-help b/ops/interaction/net/show_arp_cache/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..140088103bcef3485de5f2443fb9b35f363ce38a --- /dev/null +++ b/ops/interaction/net/show_arp_cache/ops-help @@ -0,0 +1 @@ +net: show arp cache table /\*查看arp缓存表*/\ diff --git a/ops/interaction/net/show_arp_cache/ops-run b/ops/interaction/net/show_arp_cache/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..98f50d1af1d25aa039929561f1ecf96c94406887 --- /dev/null +++ b/ops/interaction/net/show_arp_cache/ops-run @@ -0,0 +1,23 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show arp cache table\033[0m\n" + + #------ network ------ + arp + +} +main $* diff --git a/ops/interaction/net/show_socket_detail/ops-help b/ops/interaction/net/show_socket_detail/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..cb1714767b37ae7386fd986c61dc0503f493038e --- /dev/null +++ b/ops/interaction/net/show_socket_detail/ops-help @@ -0,0 +1 @@ +net: show socket detail /\*显示socket详细信息*/\ diff --git a/ops/interaction/net/show_socket_detail/ops-run b/ops/interaction/net/show_socket_detail/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..80139ac5855369e935a14478ded0e5d39735570d --- /dev/null +++ b/ops/interaction/net/show_socket_detail/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show socket detail\033[0m\n" + + #------ network ------ + ss -pl +} +main $* diff --git a/ops/interaction/net/sort_stat_protocol/ops-help b/ops/interaction/net/sort_stat_protocol/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b3dd9e292f7f6b77e99577cf38ead171f6065b25 --- /dev/null +++ b/ops/interaction/net/sort_stat_protocol/ops-help @@ -0,0 +1 @@ +net: sort to stat protocol /\*分类统计协议信息*/\ diff --git a/ops/interaction/net/sort_stat_protocol/ops-run b/ops/interaction/net/sort_stat_protocol/ops-run new file mode 
100755 index 0000000000000000000000000000000000000000..997ae7ab31aded3a20bb8162daf45d5ba568710a --- /dev/null +++ b/ops/interaction/net/sort_stat_protocol/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: sort to stat protocol\033[0m\n" + + #------ network ------ + netstat -sa +} +main $* diff --git a/ops/interaction/net/tx_from_epoll/ops-help b/ops/interaction/net/tx_from_epoll/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c3ce5e106420b532ce7d27ab9110606d6466ac2c --- /dev/null +++ b/ops/interaction/net/tx_from_epoll/ops-help @@ -0,0 +1 @@ +net: latency from packet sended to soft irq sended /\*进程发包到软中断开始发包之间的时延*/\ diff --git a/ops/interaction/net/tx_from_epoll/ops-run b/ops/interaction/net/tx_from_epoll/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..ad2e7e30d4ed3453039e06a7bfff1a03f38c5861 --- /dev/null +++ b/ops/interaction/net/tx_from_epoll/ops-run @@ -0,0 +1,29 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: latency from packet sended to soft irq sended\033[0m\n" + printf "\033[32m-------------uses:oc-ops interaction net tx_from_epoll [delay], defalut 5s\033[0m\n" + + #------ network ------ + args=$# + delay=5 + echo "${!args}"|[ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && delay=${!args} + + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops os_stat -f do_epoll_wait -t net_tx_action -s 100 -fg 8 -de $delay -o 0 +} +main $* diff --git a/ops/interaction/net/udp_all_port/ops-help b/ops/interaction/net/udp_all_port/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..dd1fb047a49535e9c99214ecb59043b76cd348fb --- /dev/null +++ b/ops/interaction/net/udp_all_port/ops-help @@ -0,0 +1 @@ +net: show all udp ports /\*所有UDP端口*/\ \ diff --git a/ops/interaction/net/udp_all_port/ops-run b/ops/interaction/net/udp_all_port/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..d5e421a476e8b3583c02186345798a47ab45d892 --- /dev/null +++ b/ops/interaction/net/udp_all_port/ops-run @@ -0,0 +1,22 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +###############################################h +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show all udp ports\033[0m\n" + + #------ network ------ + netstat -ua +} +main $* diff --git a/ops/interaction/ops-help b/ops/interaction/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..bf6a1a7111e9d3a34d3dd3e2b5ff91318abd87b6 --- /dev/null +++ b/ops/interaction/ops-help @@ -0,0 +1 @@ +HCI(human–computer interaction interface): help:oc-ops interaction diff --git a/ops/interaction/system/count_function/ops-help b/ops/interaction/system/count_function/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..791a7b6cfefc808a194844861c853ddd7724dc01 
--- /dev/null +++ b/ops/interaction/system/count_function/ops-help @@ -0,0 +1 @@ +system: show function call count /\*统计指定函数的性能 */\ diff --git a/ops/interaction/system/count_function/ops-run b/ops/interaction/system/count_function/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..ecb4375b85dfba74c2a63297a812279b10099d8d --- /dev/null +++ b/ops/interaction/system/count_function/ops-run @@ -0,0 +1,34 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show function call count\033[0m\n" + + #------ function performance ------ + if [ $# -lt 1 ]; then + echo -e "\033[32mneed \$func/[\$process]: oc-ops interaction system count_function \$function [\$process]\033[0m" + echo -e "\033[32mcount all \"os.*\" calls in libgo : oc-ops interaction system count_function go:os.* \033[0m" + echo -e "\033[32mcount \"read*\" calls in the ./test binary: oc-ops interaction system count_function ./test:read* \033[0m" + echo -e "\033[32mcount kernel fns starting with \"vfs\" : oc-ops interaction system count_function 'vfs_*' \033[0m" + echo -e "\033[32mcount calls to the sched_fork tracepoint: oc-ops interaction system count_function t:sched:sched_fork \033[0m" + exit 1 + fi + if [ $# -eq 1 ]; then + /usr/share/bcc/tools/funccount -Ti 5 $1 + else : + /usr/share/bcc/tools/funccount -Ti 5 $1 -p $2 + fi +} +main $* diff --git a/ops/interaction/system/function_performance/ops-help b/ops/interaction/system/function_performance/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..ed64828d078fb5b4aefdbe2f4710c9583ad9cb02 --- /dev/null +++ b/ops/interaction/system/function_performance/ops-help @@ -0,0 +1 @@ +system: show function performance /\*统计指定函数的性能 */\ diff --git a/ops/interaction/system/function_performance/ops-run b/ops/interaction/system/function_performance/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..ae3cf427cda5e51ed51856cb9e577179d7f72fce --- /dev/null +++ b/ops/interaction/system/function_performance/ops-run @@ -0,0 +1,49 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[35m-------------find work: show function performance[if more function, sample:1/100]\033[0m\n" + printf "\033[32mt-ops interaction function_performance \$function [\$function] [\$time]\033[0m\n" + + #------ function performance ------ + if [ $# -lt 1 ]; then + echo -e "\033[32mneed: oc-ops interaction function_performance \$function [\$function] [\$time]\033[0m" + exit 1 + fi + + args=$# + delay=10 + has_delay=0 + echo "${!args}"|[ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && has_delay=${!args} + if [ $has_delay != 0 ]; then + delay=$has_delay + args=$((args - 1)) + fi + + subdir=$(dirname $0) + dir=${subdir%ops*} + if [ $args -eq 1 ]; then + $dir/oc-ops os_stat -f $1 -fg 1 -de $delay -o 0 + elif [ $args -eq 2 ]; then + $dir/oc-ops os_stat -f $1 -1 $2 -s 100 -fg 7 -de $delay -o 0 + elif [ $args -eq 3 ]; then + $dir/oc-ops os_stat -f $1 -1 $2 -2 $3 -s 100 -fg 7 -de $delay -o 0 + elif [ $args -eq 4 ]; then + $dir/oc-ops os_stat -f $1 -1 
$2 -2 $3 -3 $4 -s 100 -fg 7 -de $delay -o 0 + elif [ $args -ge 5 ]; then + $dir/oc-ops os_stat -f $1 -1 $2 -2 $3 -3 $4 -4 $5 -s 100 -fg 7 -de $delay -o 0 + fi +} +main $* diff --git a/ops/interaction/system/hot_path/ops-help b/ops/interaction/system/hot_path/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..1150b7e54c6a2cab6f4abb2b9693ef949eae68f1 --- /dev/null +++ b/ops/interaction/system/hot_path/ops-help @@ -0,0 +1 @@ +system: show function hot path /\*输出函数调用热点链路*/\ diff --git a/ops/interaction/system/hot_path/ops-run b/ops/interaction/system/hot_path/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..2f238ed4fb9a7e3af7271e1e545326ef119b2caf --- /dev/null +++ b/ops/interaction/system/hot_path/ops-run @@ -0,0 +1,31 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show function hot path\033[0m\n" + + #------ 热点链路 ------ + if [ $# -lt 3 ]; then + echo -e "\033[32mneed 3 parameter: [linux_dir] [\$function] [\$hot, val 1:1st exec, val 0: late uses this]\033[0m" + exit 1 + fi + func=$1 + linux_dir=$2 + hot=$3 + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops os_stat -d $linux_dir -f $func -fg 0 -ht $hot -de 5 -o 0 -ft 1 +} +main $* diff --git a/ops/interaction/system/latency_between_two_function/get_data.py b/ops/interaction/system/latency_between_two_function/get_data.py new file mode 100755 index 0000000000000000000000000000000000000000..e26641050c797eaca0b079119c426d49e706093d --- /dev/null +++ b/ops/interaction/system/latency_between_two_function/get_data.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import sys +import os + + +def parse_conf(_f, _func): + num=0 + i=0 + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # skip not ave lines + if _ll.find(':' + _func) > 0: + i+=1 + if i < num: + continue + if _ll.find("num:") <= 0: + continue + + print(_ll) + + +if __name__ == "__main__": + _f1 = sys.argv[1]; + _func = sys.argv[2]; + + _kv1 = parse_conf(_f1, _func); diff --git a/ops/interaction/system/latency_between_two_function/ops-help b/ops/interaction/system/latency_between_two_function/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..20f4b6e9ed231ccee8f5facc9ff9814bdebe2e42 --- /dev/null +++ b/ops/interaction/system/latency_between_two_function/ops-help @@ -0,0 +1 @@ +system: show latency between two functions /\*统计两个函数之间的时延*/\ diff --git a/ops/interaction/system/latency_between_two_function/ops-run b/ops/interaction/system/latency_between_two_function/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..d4c39fb5624fb4f87a72489ac89a3d2e0178a998 --- /dev/null +++ b/ops/interaction/system/latency_between_two_function/ops-run @@ -0,0 +1,55 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show latency from:$1 to $2 [$3](linux dir for hotpath) [\$delay]\033[0m\n" + + if [ $# -lt 2 
]; then + printf "\033[32mneend manual entry function1 function2 [linux dir] [delay], such as\033[0m\n" + printf "\033[32moc-ops interaction system latency_between_two_function function1 function2 [/data/linux] [\$delay]\033[0m\n" + return + fi + + args=$# + delay=5 + has_delay=0 + echo "${!args}"|[ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && has_delay=${!args} + if [ $has_delay != 0 ]; then + delay=$has_delay + args=$((args - 1)) + fi + + dir=$(dirname $0) + tools_dir=${dir%ops*} + #------ get performance ------ + if [ $args -eq 3 ]; then + printf "\033[32m================ $1 hot path======================\033[0m\n" + printf "\033[32m================ $1 hot path======================\033[0m\n" + printf "\033[32m================ $1 hot path======================\033[0m\n" + $tools_dir/oc-ops os_stat -d $3 -f $1 -fg 0 -ht 0 -de $delay -o 0 -ft 1 &> 1.txt + $dir/get_data.py 1.txt $1 + printf "\033[32m================ $2 hot path======================\033[0m\n" + printf "\033[32m================ $2 hot path======================\033[0m\n" + printf "\033[32m================ $2 hot path======================\033[0m\n" + $tools_dir/oc-ops os_stat -d $3 -f $2 -fg 0 -ht 0 -de $delay -o 0 -ft 1 &> 1.txt + $dir/get_data.py 1.txt $2 + fi + + printf "\033[32m================ latency between $1 and $2======================\033[0m\n" + printf "\033[32m================ latency between $1 and $2======================\033[0m\n" + printf "\033[32m================ latency between $1 and $2======================\033[0m\n" + $tools_dir/oc-ops os_stat -f $1 -t $2 -s 100 -fg 8 -de $delay -o 0 +} +main $* diff --git a/ops/interaction/system/more_latency_between_funcions/1.txt b/ops/interaction/system/more_latency_between_funcions/1.txt new file mode 100644 index 0000000000000000000000000000000000000000..adea9975e1fc5c5d4c5e2a73d878b44e9d0a612c --- /dev/null +++ b/ops/interaction/system/more_latency_between_funcions/1.txt @@ -0,0 +1,22 @@ +make: Entering directory '/usr/src/kernels/5.4.241-1-tlinux4-0017.10.debug.0903.020' + Building modules, stage 2. + MODPOST 1 modules + LD [M] /usr/lib/tencentos-tools/ops/os_stat/os_stat/os_aware.ko +make: Leaving directory '/usr/src/kernels/5.4.241-1-tlinux4-0017.10.debug.0903.020' +make: 'os_stat_blongm' is up to date. +make: 'os_stat_data' is up to date. +make: 'os_stat_test' is up to date. +make: 'os_stat_scene_nohook' is up to date. +cflow version: cflow (GNU cflow) 1.7 +Copyright (C) 2005-2021 Sergey Poznyakoff +License GPLv3+: GNU GPL version 3 or later . +This is free software: you are free to change and redistribute it. +There is NO WARRANTY, to the extent permitted by law. + +Written by Sergey Poznyakoff. 
+install os stat kernel module: os_aware.ko successfully +find: ‘submit_bio’: No such file or directory +Traceback (most recent call last): + File "./change_resume.py", line 11, in + fin = open(_f1, "rt") +FileNotFoundError: [Errno 2] No such file or directory: 'func_tree_xx.txt' diff --git a/ops/interaction/system/more_latency_between_funcions/get_data.py b/ops/interaction/system/more_latency_between_funcions/get_data.py new file mode 100755 index 0000000000000000000000000000000000000000..e26641050c797eaca0b079119c426d49e706093d --- /dev/null +++ b/ops/interaction/system/more_latency_between_funcions/get_data.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import sys +import os + + +def parse_conf(_f, _func): + num=0 + i=0 + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # skip not ave lines + if _ll.find(':' + _func) > 0: + i+=1 + if i < num: + continue + if _ll.find("num:") <= 0: + continue + + print(_ll) + + +if __name__ == "__main__": + _f1 = sys.argv[1]; + _func = sys.argv[2]; + + _kv1 = parse_conf(_f1, _func); diff --git a/ops/interaction/system/more_latency_between_funcions/ops-help b/ops/interaction/system/more_latency_between_funcions/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..20f4b6e9ed231ccee8f5facc9ff9814bdebe2e42 --- /dev/null +++ b/ops/interaction/system/more_latency_between_funcions/ops-help @@ -0,0 +1 @@ +system: show latency between two functions /\*统计两个函数之间的时延*/\ diff --git a/ops/interaction/system/more_latency_between_funcions/ops-run b/ops/interaction/system/more_latency_between_funcions/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..9685cb87ca961ddfdd20574e014695f4e94097e2 --- /dev/null +++ b/ops/interaction/system/more_latency_between_funcions/ops-run @@ -0,0 +1,52 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show latency from:$1 to $2 [$3](linux dir for hotpath) [\$delay]\033[0m\n" + + if [ $# -lt 2 ]; then + printf "\033[32mneend manual entry function1 function2 [linux dir] [delay], such as\033[0m\n" + printf "\033[32mt-ops interaction system latency_between_two_function function1 function2 [/data/linux] [\$delay]\033[0m\n" + return + fi + + args=$# + delay=5 + has_delay=0 + func={} + echo "${!args}"|[ -n "`sed -n '/^[0-9][0-9]*$/p'`" ] && has_delay=${!args} + if [ $has_delay != 0 ]; then + delay=$has_delay + args=$(($args - 1)) + fi + j=1 + while [[ $# -gt 0 ]]; do + if [ $# == 1 ]; then + func[0]="-f $1" + else : + func[$#]="-$j $1" + fi + j=$(($j + 1)) + shift # past argument + done + dir=$(dirname $0) + #------ get performance ------ + printf "\033[32m================ latency between $1 and $2======================\033[0m\n" + printf "\033[32m================ latency between $1 and $2======================\033[0m\n" + printf "\033[32m================ latency between $1 and $2======================\033[0m\n" + echo ${func[@]} + t-ops os_stat ${func[@]} -s 100 -fg 8 -de $delay -o 0 +} +main $* diff --git a/ops/interaction/system/ops-help b/ops/interaction/system/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..19ead1482527998793241ef87774113a404e0e8b 
--- /dev/null +++ b/ops/interaction/system/ops-help @@ -0,0 +1 @@ +HCI: system performance and kernel tools diff --git a/ops/interaction/system/process_tool_htop/ops-help b/ops/interaction/system/process_tool_htop/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..39d790bdb93f929889ead8fa6fb493cf7ca38293 --- /dev/null +++ b/ops/interaction/system/process_tool_htop/ops-help @@ -0,0 +1 @@ +system: process tools, such as htop /\*进程监控工具,类似top*/\ diff --git a/ops/interaction/system/process_tool_htop/ops-run b/ops/interaction/system/process_tool_htop/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..3c6e04af4330c9aa1b8e36925436b1ca764eaef1 --- /dev/null +++ b/ops/interaction/system/process_tool_htop/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: process tools\033[0m\n" + + #------ htop ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops kernel-tools htop $* +} +main $* diff --git a/ops/interaction/system/read_performance/ops-help b/ops/interaction/system/read_performance/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b7327631f74ee9afc92c896fee496625cbd527b5 --- /dev/null +++ b/ops/interaction/system/read_performance/ops-help @@ -0,0 +1 @@ +system: show read performance /\*统计读性能*/\ diff --git a/ops/interaction/system/read_performance/ops-run b/ops/interaction/system/read_performance/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..ad01e2f310dc915826de9a29a55124ffcc01389c --- /dev/null +++ b/ops/interaction/system/read_performance/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show read performance\033[0m\n" + + #------ read performance ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops os_stat -f vfs_read -fg 1 -de 5 -o 0 +} +main $* diff --git a/ops/interaction/system/rubygc/ops-help b/ops/interaction/system/rubygc/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..a091194068bbcd31e688d0dbbc36092ce22efc59 --- /dev/null +++ b/ops/interaction/system/rubygc/ops-help @@ -0,0 +1 @@ +system: summarize garbage collection events(java,node,python,ruby) /\*统计高级语言gc回收事件*/\ diff --git a/ops/interaction/system/rubygc/ops-run b/ops/interaction/system/rubygc/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..4fdb9dc288c03337d3b89420dfe4bf11e45825ee --- /dev/null +++ b/ops/interaction/system/rubygc/ops-run @@ -0,0 +1,35 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: summarize garbage collection events(java,node,python,ruby)\033[0m\n" + + #------ function performance ------ + if [ $# -lt 2 ]; then + echo -e "\033[32mneed 
\$language,\$process: oc-ops interaction system rubygc \$language \$process [\$interval]\033[0m" + echo -e "\033[32mif [\$interval], only show GCs longer than this many milliseconds\033[0m" + echo -e "\033[32msuch as(pid=100): oc-ops interaction system rubygc java 100 [10]\033[0m" + exit 1 + fi + echo -e "\033[32mneed \$language,\$process,[\$interval]: oc-ops interaction system rubygc \$function \$process\033[0m" + echo -e "\033[32mif [\$interval], only show GCs longer than this many milliseconds\033[0m" + + if [ $# -eq 2 ]; then + /usr/share/bcc/tools/rubygc -l $1 $2 -m + elif [ $# -eq 3 ]; then + /usr/share/bcc/tools/rubygc -l $1 $2 -m -M $3 + fi +} +main $* diff --git a/ops/interaction/system/rubyobjnew/ops-help b/ops/interaction/system/rubyobjnew/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..20f99096edaec3ef67bd788cc7635f30b3cd8f90 --- /dev/null +++ b/ops/interaction/system/rubyobjnew/ops-help @@ -0,0 +1 @@ +system: summarize object allocations(c, java,ruby,tcl) /\*统计高级语言内存对象分配情况*/\ diff --git a/ops/interaction/system/rubyobjnew/ops-run b/ops/interaction/system/rubyobjnew/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..895fa55f88a4448b400d586287a787234c3d97cc --- /dev/null +++ b/ops/interaction/system/rubyobjnew/ops-run @@ -0,0 +1,35 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: summarize object allocations(c, java,ruby,tcl)\033[0m\n" + + #------ function performance ------ + if [ $# -lt 2 ]; then + echo -e "\033[32mneed \$language,\$process: oc-ops interaction system rubygc \$language \$process [\$top_count]\033[0m" + echo -e "\033[32msuch as(pid=100), show top 10: oc-ops interaction system rubyobjnew java 100 [10]\033[0m" + exit 1 + fi + echo -e "\033[32m1.need \$language,\$process,[\$top_count]: oc-ops interaction system rubygc \$function \$process\033[0m" + echo -e "\033[32m2.if [\$top_count]: number of most frequently allocated types to print\033[0m" + echo -e "\033[32m3.print each 5 seconds\033[0m" + + if [ $# -eq 2 ]; then + /usr/share/bcc/tools/rubyobjnew -l $1 $2 5 + elif [ $# -eq 3 ]; then + /usr/share/bcc/tools/rubyobjnew -l $1 $2 -C $3 5 + fi +} +main $* diff --git a/ops/interaction/system/signal_allsig/ops-help b/ops/interaction/system/signal_allsig/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..1613733b6565026acd2dd2958f52926af12f93f4 --- /dev/null +++ b/ops/interaction/system/signal_allsig/ops-help @@ -0,0 +1 @@ +system: show assigned signal info /\*输出指定信号信息,触发的进程,目标进程*/\ diff --git a/ops/interaction/system/signal_allsig/ops-run b/ops/interaction/system/signal_allsig/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..39b9ddd720ffed4de8a4571c906d33d116292c9e --- /dev/null +++ b/ops/interaction/system/signal_allsig/ops-run @@ -0,0 +1,31 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show signal SIGKILL info\033[0m\n" + + #------ SIGNAL information ------ + echo -e "\033[32muses: oc-ops 
interaction system signal_allsig \$sig [\$pid]\033[0m" + echo -e "\033[32muse ctrl-c to finish\033[0m" + + if [ $# -gt 1 ]; then + echo $2 > /proc/sys/os_aware/var_offset1 + fi + + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops os_stat -f kill_something_info -fg 9 -de 1000 -sig $1 +} +main $* diff --git a/ops/interaction/system/signal_sigkill/ops-help b/ops/interaction/system/signal_sigkill/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..795ad28cd9d531fe198ca1eaedb06ea73542eff9 --- /dev/null +++ b/ops/interaction/system/signal_sigkill/ops-help @@ -0,0 +1 @@ +system: show signal SIGKILL info /* print SIGKILL signal info: sender process and target process */ diff --git a/ops/interaction/system/signal_sigkill/ops-run b/ops/interaction/system/signal_sigkill/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..ff9bd00e441be01d02b85f812f5f9f3bd46ce4e1 --- /dev/null +++ b/ops/interaction/system/signal_sigkill/ops-run @@ -0,0 +1,35 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show signal SIGKILL info\033[0m\n" + + #------ SIGKILL information ------ + echo -e "\033[32muses: oc-ops interaction system signal_sigkill [pid]\033[0m" + echo -e "\033[32muse ctrl-c to finish\033[0m" + + if [ $# == 1 ]; then + echo $1 > /proc/sys/os_aware/var_offset1 + fi + + mod=$(lsmod | grep os_aware) + if [ ! -z "$mod" ]; then + echo 0 > /proc/sys/os_aware/enable + sleep 1 + rmmod os_aware + fi + oc-ops os_stat -f kill_something_info -1 find_get_task_by_vpid -s 1 -fg 9 -de 1000 -sig 9 +} +main $* diff --git a/ops/interaction/system/stackcount/ops-help b/ops/interaction/system/stackcount/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..4368199f9398b2a65152755743f2c2c8796c6cbf --- /dev/null +++ b/ops/interaction/system/stackcount/ops-help @@ -0,0 +1 @@ +system: count call events and their stack traces /* count call events of a specified function and the related call stacks */ diff --git a/ops/interaction/system/stackcount/ops-run b/ops/interaction/system/stackcount/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..22d5c30c7ef95a6cc5a65615f20646286c23b1f4 --- /dev/null +++ b/ops/interaction/system/stackcount/ops-run @@ -0,0 +1,41 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: count events and their stack traces\033[0m\n" + printf "\033[32muses: oc-ops interaction system stackcount \$function [\$cpu] [-p \$pid]\033[0m\n" + printf "\033[32msuch as, tcp_send* :oc-ops interaction system stackcount 'tcp_send*'\033[0m\n" + printf "\033[32msuch as, c lib :oc-ops interaction system stackcount c:malloc\033[0m\n" + printf "\033[32msuch as, tracepoint:oc-ops interaction system stackcount t:sched:sched_switch\033[0m\n" + printf "\033[32msuch as, function :oc-ops interaction system stackcount submit_bio\033[0m\n" + #------ frequency ------ + if [ $# -lt 1 ]; then + printf "\033[31mneed target to trace, such as one function, see above\033[0m\n" + elif [ $# -eq 1 ]; then + printf "\033[32mtrace: $1, all cpu, all process\033[0m\n" +
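# bcc stackcount options used below (see /usr/share/bcc/tools/stackcount -h): -s show address offsets, -d put a delimiter between kernel and user stacks, -v show raw addresses, -T print a timestamp for each interval +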
/usr/share/bcc/tools/stackcount -s -d -v -T $1 + elif [ $# -eq 2 ]; then + printf "\033[32mtrace: $1, and this CPU:$2 only\033[0m\n" + /usr/share/bcc/tools/stackcount -s -d -v -T -c $2 $1 + elif [ $# -eq 3 ]; then + printf "\033[32mtrace: $1, and this pid:$3 only\033[0m\n" + /usr/share/bcc/tools/stackcount -s -d -v -T -p $3 $1 + elif [ $# -eq 4 ]; then + printf "\033[32mtrace: $1, and cpu:$2, this pid:$4 only\033[0m\n" + /usr/share/bcc/tools/stackcount -s -d -v -T -c $2 -p $4 $1 + fi + +} +main $* diff --git a/ops/interaction/system/system_performance/ops-help b/ops/interaction/system/system_performance/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..849a5f778fd1df552ebbd3a9081c0f769c042ad3 --- /dev/null +++ b/ops/interaction/system/system_performance/ops-help @@ -0,0 +1 @@ +system: all system performance /* show overall system performance */ diff --git a/ops/interaction/system/system_performance/ops-run b/ops/interaction/system/system_performance/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..cb2ccb0b79136c481798cbc96cd570907169c440 --- /dev/null +++ b/ops/interaction/system/system_performance/ops-run @@ -0,0 +1,28 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: all system performance: oc-ops interaction system system_performance [delay]\033[0m\n" + + #------ system performance ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + if [ $# == 0 ]; then + $dir/oc-ops os_stat -fg 2 -de 10 + else + $dir/oc-ops os_stat -fg 2 -de $1 + fi +} +main $* diff --git a/ops/interaction/system/write_performance/ops-help b/ops/interaction/system/write_performance/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..df11325fd68ebc897fb70b645da430cb711fcf3e --- /dev/null +++ b/ops/interaction/system/write_performance/ops-help @@ -0,0 +1 @@ +system: show write performance /* write performance statistics */ diff --git a/ops/interaction/system/write_performance/ops-run b/ops/interaction/system/write_performance/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..d3bb18e885a99514ccd87102d555f82be7a7c93d --- /dev/null +++ b/ops/interaction/system/write_performance/ops-run @@ -0,0 +1,24 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Author : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show write performance\033[0m\n" + + #------ write performance ------ + subdir=$(dirname $0) + dir=${subdir%ops*} + $dir/oc-ops os_stat -f vfs_write -fg 1 -de 5 -o 0 +} +main $* diff --git a/ops/io/latency/blkparse.sh b/ops/io/latency/blkparse.sh new file mode 100755 index 0000000000000000000000000000000000000000..3305f59bc15185ff0385556d846c6017581b013d --- /dev/null +++ b/ops/io/latency/blkparse.sh @@ -0,0 +1,41 @@ +#!/bin/bash
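+# Flow (a summary of the steps below): blkparse merges the blktrace data for device $2 into sd-nvme.bin, btt summarizes it into btt.txt, the average Q2C latency is extracted and compared against $average_latency_threshold, and the per-period directory $1 is archived; the thresholds and flags (find_exceed_io, Q2C_threshold, D2C_threshold, remain_data) are exported by runall.sh.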
+#Usage: ./blkparse.sh $fname $devname +blkparse ${2} -d sd-nvme.bin > /dev/null +btt -i sd-nvme.bin -l sd-nvme > btt.txt +safe_rm *.dat & +safe_rm sd-nvme.bin 1>/dev/null 2>&1 & + +line_num=$( grep -n "Device Q2Q Seek Information" btt.txt | awk -F : '{print $1}' ) +line_num=$(( $line_num - 1 )) +#clear +echo "" ; echo "***********************Block IO Traces Summary***********************" +head -n $line_num btt.txt +echo "***********************Block IO Summary End**************************"; echo "" + +cat btt.txt | grep Q2C > temp.txt + +if (( $find_exceed_io == 1 )); then + workingpath=$(pwd) + ../../find_exceed_queue.sh -p $workingpath -d $2 -Q $Q2C_threshold -D $D2C_threshold +fi + +Q2CAvg=`awk '{ print $3 }' ./temp.txt` + +if echo $Q2CAvg | grep -qe "^[-\?[0-9]\+\.\?[0-9]*$" +then + echo $Q2CAvg > /dev/null +else + Q2CAvg=0 +fi + +cd .. +if (( $remain_data == 0 )); then + safe_rm -rf "$1" + exit 0 +fi +tar -zcf ${1}_bak.tar.gz $1 +safe_rm -rf "$1" + +if [ $(echo "$Q2CAvg > $average_latency_threshold" | bc) -ne 0 ]; then + cp ${1}_bak.tar.gz ${1}_exceed.tar.gz +fi diff --git a/ops/io/latency/blktrace.sh b/ops/io/latency/blktrace.sh new file mode 100755 index 0000000000000000000000000000000000000000..86070fcd732a874441cf1331eae83144b03b11fd --- /dev/null +++ b/ops/io/latency/blktrace.sh @@ -0,0 +1,10 @@ +#! +../checksize.sh $2 & +while true; do + fname=`date +%Y-%m-%d-%H-%M-%S` + mkdir ${fname} 2>/dev/null + cd ${fname} + blktrace -d /dev/${1} -w $period_time > /dev/null + ../../blkparse.sh ${fname} ${1} & + cd .. +done diff --git a/ops/io/latency/checksize.sh b/ops/io/latency/checksize.sh new file mode 100755 index 0000000000000000000000000000000000000000..040c391d46165cdb7381e5e6d54925c6ef8319b0 --- /dev/null +++ b/ops/io/latency/checksize.sh @@ -0,0 +1,14 @@ +#! +while true; do + du `pwd` -s > temp.txt + size=`awk '{ print $1 }' ./temp.txt` + safe_rm temp.txt + + if [ $(echo "$size > $1" | bc) -ne 0 ]; then + ls -ltr | grep _bak | head -1 > folder.txt + folder=`awk '{ print $9 }' ./folder.txt` + safe_rm -rf "$folder" + safe_rm folder.txt + fi + sleep 60 +done diff --git a/ops/io/latency/find_exceed_queue.sh b/ops/io/latency/find_exceed_queue.sh new file mode 100755 index 0000000000000000000000000000000000000000..dfca434c7a38f58fc46ecc580ed356d2b90a276b --- /dev/null +++ b/ops/io/latency/find_exceed_queue.sh @@ -0,0 +1,96 @@ +#!/bin/bash + +devname="" +workingpath=$(pwd) +Qthreshold=0 +Dthreshold=0 + +usage="\ + Usage: +./find_exceed_queue.sh [-d devname] [-p path] [-Q threshold] [-D threshold] [-h] +COMMAND-LINE Options: +-d ./find_exceed_queue.sh 脚本将要跳转过去的工作目录 +-Q IO Q2C(表示io Queued To Complete)耗时的阈值,支持浮点表示,单位为秒 +-D IO D2C(表示io Dispatched/Issued To Complete)耗时的阈值 ,支持浮点表示,单位为秒 +-h 显示./find_exceed_queue.sh的用法 +" +while getopts 'd:p:Q:D:h' OPT; do + case $OPT in + d) devname="$OPTARG" + ;; + p) workingpath="$OPTARG" + ;; + Q) Qthreshold="$OPTARG" + ;; + D) Dthreshold="$OPTARG" + ;; + h) echo "$usage" + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac +done + +cd "$workingpath" + +if [ ! 
-e blktrace.txt ]; then + blkparse -i $devname -o blktrace.txt 1>/dev/null +fi + +safe_rm *_without_C_sectors.txt *2C_exceed_sectors.txt *2C_exceed_blktrace.txt 2>/dev/null + +grep "C " blktrace.txt > queue_C.txt + +if [[ $Qthreshold != 0 ]]; then + grep "Q " blktrace.txt > queue_Q.txt + while read line + do + Qsector=$( echo $line | awk '{ print $8 }' ) + Qtime=$( echo $line | awk '{ print $4 }' ) + Csectorline=$( grep $Qsector queue_C.txt ) + Csector=$( echo $Csectorline | awk '{ print $8 }' ) + if [ -z $Csector ]; then + echo $Qsector >> Q_without_C_sectors.txt + continue + fi + + Ctime=$( echo $Csectorline | awk '{ print $4 }' ) + Q2Ctime=$( echo "$Ctime - $Qtime" | bc ) + if [ $(echo "$Q2Ctime > $Qthreshold" | bc) -ne 0 ]; then + echo $Qsector >> Q2C_exceed_sectors.txt + grep $Qsector blktrace.txt >> Q2C_exceed_blktrace.txt + echo "" >> Q2C_exceed_blktrace.txt + fi + done < queue_Q.txt + echo "" ; echo "****************************Q2C Exceed IO****************************" + cat Q2C_exceed_blktrace.txt 2>/dev/null +fi + +if [[ $Dthreshold != 0 ]]; then + grep "D " blktrace.txt > queue_D.txt + while read line + do + Dsector=$( echo $line | awk '{ print $8 }' ) + Dtime=$( echo $line | awk '{ print $4 }' ) + Csectorline=$( grep $Dsector queue_C.txt ) + Csector=$( echo $Csectorline | awk '{ print $8 }' ) + if [ -z $Csector ]; then + echo $Dsector >> D_without_C_sectors.txt + continue + fi + + Ctime=$( echo $Csectorline | awk '{ print $4 }' ) + D2Ctime=$( echo "$Ctime - $Dtime" | bc ) + if [ $(echo "$D2Ctime > $Dthreshold" | bc) -ne 0 ]; then + echo $Dsector >> D2C_exceed_sectors.txt + grep $Dsector blktrace.txt >> D2C_exceed_blktrace.txt + echo "" >> D2C_exceed_blktrace.txt + fi + done < queue_D.txt + echo "" ; echo "****************************D2C Exceed IO****************************" + cat D2C_exceed_blktrace.txt 2>/dev/null +fi +echo "**************************Get Exceed IO End**************************" diff --git a/ops/io/latency/kill.sh b/ops/io/latency/kill.sh new file mode 100755 index 0000000000000000000000000000000000000000..36439de018b77a301fb3979344c152eb0c2f51de --- /dev/null +++ b/ops/io/latency/kill.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +source lib.sh +set_cgroup_rights + +mylogdir="/data/oc-ops/io/latency" +mycurrdir="$(pwd)" +remain=0 + +while getopts 'l:r:h' OPT; do + case $OPT in + l) logdir="$OPTARG" + ;; + r) remain="$OPTARG" + ;; + h) echo "$usage" + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac +done + +if [[ $mycurrdir == $mylogdir && $remain == 0 ]]; then + rm -rf *.sh sd* nvme* vd* run_log 2>/dev/null +fi + + +kill_times=5 +echo "Killing previous blktrace processes, please wait $kill_times seconds ..." + +for ((i=0; i<$kill_times; i++)) +do + #echo killing previous blktrace... 
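+ # find the process-group ids of blktrace instances running on /dev/nvme*, /dev/sd* or /dev/vd*, then kill every pid in each group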
+ pgid=`ps fjx | grep blktrace | grep '/dev' | grep -E 'nvme|sd|vd' | awk '{ print $3}'` + for g in $pgid + do + pid=`ps fjx | grep $g | awk '{ print $2}'` + for j in $pid + do + #echo killing ${j} + kill $j 1>/dev/null 2>&1 + done + done + sleep 1 +done diff --git a/ops/io/latency/lib.sh b/ops/io/latency/lib.sh new file mode 100755 index 0000000000000000000000000000000000000000..12dbfccf2218bcdcc853c3f18d0e24949195ffbf --- /dev/null +++ b/ops/io/latency/lib.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +set_cgroup_rights() +{ + if [ -e /sys/fs/cgroup/cgroup.procs ]; then + ## cgroup v2 + echo "$PPID" > /sys/fs/cgroup/cgroup.procs + echo "$$" > /sys/fs/cgroup/cgroup.procs + else + if [ -e /sys/fs/cgroup/cpuset/cgroup.procs ]; then + ## cgroup v1 + echo "$PPID" > /sys/fs/cgroup/cpuset/cgroup.procs 2>/dev/null + echo "$$" > /sys/fs/cgroup/cpuset/cgroup.procs 2>/dev/null + fi + + + if [ -e /sys/fs/cgroup/memory/cgroup.procs ]; then + ## cgroup v1 + echo "$PPID" > /sys/fs/cgroup/memory/cgroup.procs 2>/dev/null + echo "$$" > /sys/fs/cgroup/memory/cgroup.procs 2>/dev/null + fi + fi +} + +export -f set_cgroup_rights diff --git a/ops/io/latency/ops-help b/ops/io/latency/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b097158b43cc49b3597975d892407d431313f833 --- /dev/null +++ b/ops/io/latency/ops-help @@ -0,0 +1 @@ +Check block IO latency (blktrace/btt based sampling) diff --git a/ops/io/latency/ops-run b/ops/io/latency/ops-run new file mode 120000 index 0000000000000000000000000000000000000000..9d7f5e091669ba1c4a4bd3db1570c59c558610b7 --- /dev/null +++ b/ops/io/latency/ops-run @@ -0,0 +1 @@ +runall.sh \ No newline at end of file diff --git a/ops/io/latency/run.sh b/ops/io/latency/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..da186604da244e138061072075cdcfcb704a79f6 --- /dev/null +++ b/ops/io/latency/run.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# usage : ./run.sh block_dev_name log_size_limit_kb, such as ./run.sh nvme0n1 524288 (normally invoked by runall.sh) +mkdir $1 2>/dev/null +cd $1 +./../blktrace.sh $1 $2 +cd ..
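A minimal invocation sketch for the io latency tooling added below (runall.sh is the target of the ops-run symlink above; the device name sdb is only an example):
  oc-ops io latency -d sdb -p 60 -r   (sample sdb in 60-second periods and keep the blktrace data)
  oc-ops io latency -k -r             (stop all sampling processes, keeping the collected logs)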
diff --git a/ops/io/latency/runall.sh b/ops/io/latency/runall.sh new file mode 100755 index 0000000000000000000000000000000000000000..67ceaa6abbd99c5c46052c5d076560fe340f8a25 --- /dev/null +++ b/ops/io/latency/runall.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +usage="\ +Usage: + oc-ops io latency -d device [-s size] [-l logdir] [-a average] [-m max] [-p period] [-Q threshold] [-D threshold] [-r] [-k] [-h] + COMMAND-LINE Options: + -d 指定监控的设备,如:sda ;多个设备之间用逗号相连,如:sda,sdb + -s 指定日志存储空间(针对-d参数中的每个设备)的大小,超过大小后,日志将自动回滚 (超过阈值的日志仍会保留), 默认1048576KB + -l, 指定日志存储目录,需按绝对路径传递存储目录;默认是 /data/oc-ops/io/latency + 为便于校验目录合法性、避免进入其他目录创建和删除文件,-l指定的路径中必须包含"io/latency"字符串 + -a io average latency的阈值,支持浮点表示,单位为秒,默认0.2s + -m io max latency的阈值,支持浮点表示,单位为秒,默认5s + -p 每个监控周期的时间,单位为秒,默认60s (备注:一个监控周期结束,会自动开始下一个周期的监控) + -Q IO Q2C(表示io Queued To Complete)耗时的阈值,支持浮点表示,单位为秒,默认与 -m 的数值相同 + -D IO D2C(表示io Dispatched/Issued To Complete)耗时的阈值 ,支持浮点表示,单位为秒,默认与 -m 的数值相同 + -r 保留中间过程的日志,使用此参数可便于调试或基于保留的日志更深入的分析IO时延问题 + 一但有 oc-ops io latency 命令没有带-r参数,则此次采样过程种的所有blktrace数据不会保留 + -k kill 所有oc-ops io latency监控io的进程,结束监控 + 使用-k时,如想保留blktrace数据进一步分析,也需加上-r参数; 否则将清空历次采用的所有的blktrace数据 + -h 显示oc-ops io latency的用法 + +使用注意项: + -l 指定的日志存储目录,尽量别在-d指定的监控的设备上,因为日志存储会产生io、加大被监控设备的io压力; + -l 指定的日志存储目录,尽量别指定内存文件系统(如tmpfs)所在的目录,因为在内存文件系统上存储日志会消耗内存,对业务产生干扰; + -d 可支持监控多个device,如果监控过多设备(特别是快速设备,如nvme),会对cpu占用率产生干扰。 +" + +if (( $# < 1 )); then + echo "$usage" + exit 1 +fi + +device_array=() +export perdevlog_store_size=1048576 +export logdir="/data/oc-ops/io/latency" +export average_latency_threshold=0.2 +export max_latency_threshold=5 +export period_time=60 +export Q2C_threshold=0 +export D2C_threshold=0 +export remain_data=0 +export find_exceed_io=0 + +export curr_dir=$(pwd) +export work_dir=$(readlink /proc/$$/fd/255); work_dir=$(dirname $work_dir); cd "$work_dir" +export run_log="$logdir/run_log" + +source lib.sh +set_cgroup_rights + +kill_sample=0 + +get_device_name() +{ + device_array="$@" +} + +check_logdir() +{ + echo "$logdir" | grep ^[/] > /dev/null ; local ret_val=$? + if (( $ret_val != 0 )); then echo "Must using absolute path!"; return 1 ; fi + + echo "$logdir" | grep "io/latency" > /dev/null ; ret_val=$? + if (( $ret_val != 0 )); then echo "Path must including \"io/latency\" substring!"; return 1 ; fi + + echo "$logdir" | grep -E '[ | ]' > /dev/null ; ret_val=$? + if (( $ret_val == 0 )); then echo "Path must not including space and tab char!"; return 1 ; fi + + return 0 +} + +safe_rm() +{ + echo "$PWD/" | grep "$logdir" > /dev/null ; local ret_val=$? + if (( $ret_val != 0 )); then echo "Forbid rm outside the $logdir dir!"; return 1 ; fi + + echo "$@" | grep "/" > /dev/null ; ret_val=$? + if (( $ret_val == 0 )); then echo "Forbid having \"/\" in args!"; return 1 ; fi + + rm $@ 2>/dev/null +} +export -f safe_rm + +while getopts 'd:s:l:a:m:p:Q:D:rkh' OPT; do + case $OPT in + d) param_tmp="$OPTARG" + get_device_name $( echo $param_tmp | sed 's/,/ /g' ) + ;; + s) perdevlog_store_size="$OPTARG" + ;; + l) logdir="$OPTARG" + check_logdir + if (( $? != 0)); then + echo "The dir \"$logdir\" is illegal!" + exit 1 + fi + ;; + a) average_latency_threshold="$OPTARG" + ;; + m) max_latency_threshold="$OPTARG" + ;; + p) period_time="$OPTARG" + ;; + Q) Q2C_threshold="$OPTARG" + find_exceed_io=1 + ;; + D) D2C_threshold="$OPTARG" + find_exceed_io=1 + ;; + r) remain_data=1 + ;; + k) kill_sample=1 + ;; + h) echo "$usage" + exit $? + ;; + ?) 
echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac +done + +mkdir -p "$logdir" +cd "$logdir" && if [ -e kill.sh ]; then ./kill.sh -l "$logdir" -r $remain_data ; fi +if (( kill_sample == 1 )); then + exit 0 +fi + +## del it in the later +cp -a "$work_dir"/*.sh "$logdir" + +div=2 +half=`expr $perdevlog_store_size / $div` + +echo "Will display IO sampling result every $period_time, please wait..." +for devname in ${device_array[@]} +do + if [ -e /dev/$devname ]; then + ## run at here. + ./run.sh $devname $half & + else + echo "Error param! Device name is $devname, but there isn't /dev/$devname !" + echo "All params: $@" + exit 1 + fi +done diff --git a/ops/io/ops-help b/ops/io/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..a791fcb07a405d32fd94c22301f765141b3e0ed5 --- /dev/null +++ b/ops/io/ops-help @@ -0,0 +1 @@ +IO latency relative tools diff --git a/ops/io/s_iostat/ops-help b/ops/io/s_iostat/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c06de9c4c835dd6d4a82c7b44e9b078b8400a895 --- /dev/null +++ b/ops/io/s_iostat/ops-help @@ -0,0 +1 @@ +iostat: report disk io performance, support device level read/wirte speed stat. diff --git a/ops/io/s_iostat/ops-run b/ops/io/s_iostat/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..d8c5dd80554ea419514ba6df97427e48287e2d50 --- /dev/null +++ b/ops/io/s_iostat/ops-run @@ -0,0 +1,52 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function strstr() +{ + echo $1 | grep $2 +} + +usage="\ +Usage: + t-ops io iostat [-h] +" +function help() +{ + while getopts 'h' OPT; do + case $OPT in + h) echo "$usage" + iostat --h + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done +} + +function main() +{ + + + if [ $# -ge 1 ]; then + ret=$(strstr $1 "\-h") + if [ ! -z "$ret" ]; then + help $* + exit 1 + fi + fi + + iostat $@ +} + +main $* diff --git a/ops/kdump/kdump.conf b/ops/kdump/kdump.conf new file mode 100644 index 0000000000000000000000000000000000000000..dea2e94861d436bfb0bf4ed96952a23c3b6887e5 --- /dev/null +++ b/ops/kdump/kdump.conf @@ -0,0 +1,185 @@ +# This file contains a series of commands to perform (in order) in the kdump +# kernel after a kernel crash in the crash kernel(1st kernel) has happened. +# +# Directives in this file are only applicable to the kdump initramfs, and have +# no effect once the root filesystem is mounted and the normal init scripts are +# processed. +# +# Currently, only one dump target and path can be specified. If the dumping to +# the configured target fails, the failure action which can be configured via +# the "failure_action" directive will be performed. +# +# Supported options: +# +# raw +# - Will dd /proc/vmcore into . +# Use persistent device names for partition devices, +# such as /dev/vg/. +# +# nfs +# - Will mount nfs to , and copy /proc/vmcore to +# //%HOST-%DATE/, supports DNS. +# +# ssh +# - Will save /proc/vmcore to :/%HOST-%DATE/, +# supports DNS. +# NOTE: make sure the user has write permissions on the server. +# +# sshkey +# - Will use the sshkey to do ssh dump. +# Specify the path of the ssh key to use when dumping +# via ssh. The default value is /root/.ssh/kdump_id_rsa. 
+# +# +# - Will mount -t , and copy +# /proc/vmcore to //%HOST_IP-%DATE/. +# NOTE: can be a device node, label or uuid. +# It's recommended to use persistent device names +# such as /dev/vg/. +# Otherwise it's suggested to use label or uuid. +# +# path +# - "path" represents the file system path in which vmcore +# will be saved. If a dump target is specified in +# kdump.conf, then "path" is relative to the specified +# dump target. +# +# Interpretation of "path" changes a bit if the user didn't +# specify any dump target explicitly in kdump.conf. In this +# case, "path" represents the absolute path from root. The +# dump target and adjusted path are arrived at automatically +# depending on what's mounted in the current system. +# +# Ignored for raw device dumps. If unset, will use the default +# "/var/crash". +# +# core_collector +# - This allows you to specify the command to copy +# the vmcore. The default is makedumpfile, which on +# some architectures can drastically reduce vmcore size. +# See /sbin/makedumpfile --help for a list of options. +# Note that the -i and -g options are not needed here, +# as the initrd will automatically be populated with a +# config file appropriate for the running kernel. +# The default core_collector for raw/ssh dump is: +# "makedumpfile -F -l --message-level 7 -d 31". +# The default core_collector for other targets is: +# "makedumpfile -l --message-level 7 -d 31". +# +# "makedumpfile -F" will create a flattened vmcore. +# You need to use "makedumpfile -R" to rearrange the dump data to +# a normal dumpfile readable with analysis tools. For example: +# "makedumpfile -R vmcore < vmcore.flat". +# +# For core_collector format details, you can refer to +# kexec-kdump-howto.txt or kdump.conf manpage. +# +# kdump_post +# - This directive allows you to run a executable binary +# or script after the vmcore dump process terminates. +# The exit status of the current dump process is fed to +# the executable binary or script as its first argument. +# All files under /etc/kdump/post.d are collectively sorted +# and executed in lexical order, before binary or script +# specified kdump_post parameter is executed. +# +# kdump_pre +# - Works like the "kdump_post" directive, but instead of running +# after the dump process, runs immediately before it. +# Exit status of this binary is interpreted as follows: +# 0 - continue with dump process as usual +# non 0 - run the final action (reboot/poweroff/halt) +# All files under /etc/kdump/pre.d are collectively sorted and +# executed in lexical order, after binary or script specified +# kdump_pre parameter is executed. +# Even if the binary or script in /etc/kdump/pre.d directory +# returns non 0 exit status, the processing is continued. +# +# extra_bins +# - This directive allows you to specify additional binaries or +# shell scripts to be included in the kdump initrd. +# Generally they are useful in conjunction with a kdump_post +# or kdump_pre binary or script which depends on these extra_bins. +# +# extra_modules +# - This directive allows you to specify extra kernel modules +# that you want to be loaded in the kdump initrd. +# Multiple modules can be listed, separated by spaces, and any +# dependent modules will automatically be included. +# +# failure_action +# - Action to perform in case dumping fails. +# reboot: Reboot the system. +# halt: Halt the system. +# poweroff: Power down the system. +# shell: Drop to a bash shell. +# Exiting the shell reboots the system by default, +# or perform "final_action". 
+# dump_to_rootfs: Dump vmcore to rootfs from initramfs context and +# reboot by default or perform "final_action". +# Useful when non-root dump target is specified. +# The default option is "reboot". +# +# default +# - Same as the "failure_action" directive above, but this directive +# is obsolete and will be removed in the future. +# +# final_action +# - Action to perform in case dumping succeeds. Also performed +# when "shell" or "dump_to_rootfs" failure action finishes. +# Each action is same as the "failure_action" directive above. +# The default is "reboot". +# +# force_rebuild <0 | 1> +# - By default, kdump initrd will only be rebuilt when necessary. +# Specify 1 to force rebuilding kdump initrd every time when kdump +# service starts. +# +# force_no_rebuild <0 | 1> +# - By default, kdump initrd will be rebuilt when necessary. +# Specify 1 to bypass rebuilding of kdump initrd. +# +# force_no_rebuild and force_rebuild options are mutually +# exclusive and they should not be set to 1 simultaneously. +# +# override_resettable <0 | 1> +# - Usually an unresettable block device can't be a dump target. +# Specifying 1 when you want to dump even though the block +# target is unresettable +# By default, it is 0, which will not try dumping destined to fail. +# +# dracut_args +# - Pass extra dracut options when rebuilding kdump initrd. +# +# fence_kdump_args +# - Command line arguments for fence_kdump_send (it can contain +# all valid arguments except hosts to send notification to). +# +# fence_kdump_nodes +# - List of cluster node(s) except localhost, separated by spaces, +# to send fence_kdump notifications to. +# (this option is mandatory to enable fence_kdump). +# + +#raw /dev/vg/lv_kdump +#ext4 /dev/vg/lv_kdump +#ext4 LABEL=/boot +#ext4 UUID=03138356-5e61-4ab3-b58e-27507ac41937 +#nfs my.server.com:/export/tmp +#nfs [2001:db8::1:2:3:4]:/export/tmp +#ssh user@my.server.com +#ssh user@2001:db8::1:2:3:4 +#sshkey /root/.ssh/kdump_id_rsa +path /var/crash +core_collector makedumpfile -l --message-level 7 -d 31 +#core_collector scp +#kdump_post /var/crash/scripts/kdump-post.sh +#kdump_pre /var/crash/scripts/kdump-pre.sh +#extra_bins /usr/bin/lftp +#extra_modules gfs2 +#failure_action shell +#force_rebuild 1 +#force_no_rebuild 1 +#dracut_args --omit-drivers "cfg80211 snd" --add-drivers "ext2 ext3" +#fence_kdump_args -p 7410 -f auto -c 0 -i 10 +#fence_kdump_nodes node1 node2 diff --git a/ops/kdump/ops-help b/ops/kdump/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c9485397f57dbcb763177b86065235eddc2d7a8d --- /dev/null +++ b/ops/kdump/ops-help @@ -0,0 +1 @@ +Check kdump service status diff --git a/ops/kdump/ops-run b/ops/kdump/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..c7ae56c388c3fa935cea43fb37e459d62dd92c1e --- /dev/null +++ b/ops/kdump/ops-run @@ -0,0 +1,182 @@ +#!/bin/bash + +KDUMP_CONFIG_FILE="/etc/kdump.conf" + +# Read from kdump config file stripping all comments +read_strip_comments() +{ + # strip heading spaces, and print any content starting with + # neither space or #, and strip everything after # + sed -n -e "s/^\s*\([^# \t][^#]\+\).*/\1/gp" $1 +} + +strip_comments() +{ + echo $@ | sed -e 's/\(.*\)#.*/\1/' +} + +# get_option_value +# retrieves value of option defined in kdump.conf +get_option_value() { + strip_comments `grep "^$1[[:space:]]\+" /etc/kdump.conf | tail -1 | cut -d\ -f2-` +} + +check_failure_action_config() +{ + local default_option + local failure_action + local option="failure_action" + + default_option=$(awk 
'$1 ~ /^default$/ {print $2;}' $KDUMP_CONFIG_FILE) + failure_action=$(awk '$1 ~ /^failure_action$/ {print $2;}' $KDUMP_CONFIG_FILE) + + if [ -z "$failure_action" -a -z "$default_option" ]; then + return 0 + elif [ -n "$failure_action" -a -n "$default_option" ]; then + echo "Cannot specify 'failure_action' and 'default' option together" + return 1 + fi + + if [ -n "$default_option" ]; then + option="default" + failure_action="$default_option" + fi + + case "$failure_action" in + reboot|halt|poweroff|shell|dump_to_rootfs) + return 0 + ;; + *) + echo $"Usage kdump.conf: $option {reboot|halt|poweroff|shell|dump_to_rootfs}" + return 1 + esac +} + +check_final_action_config() +{ + local final_action + + final_action=$(awk '$1 ~ /^final_action$/ {print $2;}' $KDUMP_CONFIG_FILE) + if [ -z "$final_action" ]; then + return 0 + else + case "$final_action" in + reboot|halt|poweroff) + return 0 + ;; + *) + echo $"Usage kdump.conf: final_action {reboot|halt|poweroff}" + return 1 + esac + fi +} + +check_fence_kdump_config() +{ + local hostname=`hostname` + local ipaddrs=`hostname -I` + local nodes=$(get_option_value "fence_kdump_nodes") + + for node in $nodes; do + if [ "$node" = "$hostname" ]; then + echo "Option fence_kdump_nodes cannot contain $hostname" + return 1 + fi + # node can be ipaddr + echo $ipaddrs | grep $node > /dev/null + if [ $? -eq 0 ]; then + echo "Option fence_kdump_nodes cannot contain $node" + return 1 + fi + done + + return 0 +} + +check_kdump_config() +{ + local nr + + nr=$(awk 'BEGIN{cnt=0} /^raw|^ssh[[:blank:]]|^nfs|^ext[234]|^xfs|^btrfs|^minix|^dracut_args .*\-\-mount/{cnt++} END{print cnt}' $KDUMP_CONFIG_FILE) + [ $nr -gt 1 ] && { + echo "More than one dump targets specified." + return 1 + } + + nr=$(grep "^dracut_args .*\-\-mount" $KDUMP_CONFIG_FILE | grep -o "\-\-mount" | wc -l) + [ $nr -gt 1 ] && { + echo "Multiple mount targets specified in one \"dracut_args\"." + return 1 + } + + # Check if we have any leading spaces (or tabs) before the + # variable name in the kdump conf file + if grep -E -q '^[[:blank:]]+[a-z]' $KDUMP_CONFIG_FILE; then + echo "No whitespaces are allowed before a kdump option name in $KDUMP_CONFIG_FILE" + return 1 + fi + + while read config_opt config_val; do + case "$config_opt" in + \#* | "") + ;; + raw|ext2|ext3|ext4|minix|btrfs|xfs|nfs|ssh|sshkey|path|core_collector|kdump_post|kdump_pre|extra_bins|extra_modules|default|force_rebuild|force_no_rebuild|dracut_args|fence_kdump_args|fence_kdump_nodes) + [ -z "$config_val" ] && { + echo "Invalid kdump config value for option $config_opt." + return 1; + } + if [ -d "/proc/device-tree/ibm,opal/dump" ] && [ "$config_opt" == "raw" ]; then + echo "WARNING: Won't capture opalcore when 'raw' dump target is used." + fi + ;; + net|options|link_delay|disk_timeout|debug_mem_level|blacklist) + echo "Deprecated kdump config option: $config_opt. Refer to kdump.conf manpage for alternatives." + return 1 + ;; + *) + echo "Invalid kdump config option $config_opt" + return 1; + ;; + esac + done <<< "$(read_strip_comments $KDUMP_CONFIG_FILE)" + + check_failure_action_config || return 1 + check_final_action_config || return 1 + check_fence_kdump_config || return 1 + + return 0 +} + + +check_kdump_status(){ + systemctl status kdump >/dev/null + return $? +} + +ops_run(){ + check_kdump_status + if [ $? -ne 0 ]; then + check_kdump_config + if [ $? 
-ne 0 ]; then + echo "/etc/kdump.conf check fails, /etc/kdump.conf will be overwritten with default configuration, old configuration is archived in /etc/kdump.conf.old" + cp /etc/kdump.conf /etc/kdump.conf.old + cp -f /usr/lib/opencloudos-tools/ops/kdump/kdump.conf /etc/kdump.conf + systemctl restart kdump >/dev/null + if [ $? -ne 0 ]; then + echo 'The kdump service status is abnormal after restart, configuration check is passed, there may be other reasons for the service failure, See "systemctl status kdump.service" and "journalctl -xe" for details.' + return 1 + fi + echo "The kdump service status is normal with default conf." + return 0 + fi + systemctl restart kdump >/dev/null + if [ $? -ne 0 ]; then + echo 'The kdump service status is abnormal, configuration check is passed, there may be other reasons for the service failure, See "systemctl status kdump.service" and "journalctl -xe" for details.' + return 1 + fi + fi + echo "kdump service is normal." + return 0 +} + +ops_run diff --git a/ops/kernel-tools/cpupower/ops-help b/ops/kernel-tools/cpupower/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..aa4a742937d6c185f3ca9b05a00804f2097d59f1 --- /dev/null +++ b/ops/kernel-tools/cpupower/ops-help @@ -0,0 +1 @@ +cpupower: check cpu frequency, power .etc, whick could manage cpu frequency governor, c-states .etc diff --git a/ops/kernel-tools/cpupower/ops-run b/ops/kernel-tools/cpupower/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..54f8d1fe3d1dc0d109d288c70147cec2cb9e21a4 --- /dev/null +++ b/ops/kernel-tools/cpupower/ops-run @@ -0,0 +1,17 @@ +#!/bin/sh +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + /usr/bin/cpupower $* +} + +main $* diff --git a/ops/kernel-tools/htop/ops-help b/ops/kernel-tools/htop/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..a53405dbc9975c839235219327b21d5960793e82 --- /dev/null +++ b/ops/kernel-tools/htop/ops-help @@ -0,0 +1 @@ +htop: stat process status, like ps and top diff --git a/ops/kernel-tools/htop/ops-run b/ops/kernel-tools/htop/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..34859d1eea782c06344859671a66788099d83fcb --- /dev/null +++ b/ops/kernel-tools/htop/ops-run @@ -0,0 +1,19 @@ +#!/bin/sh + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + + installed=$(strstr "rpm -qa" htop ) + if [ -z "$installed" ] ; then + yum install htop -y + fi + + htop $* +} + +main $* diff --git a/ops/kernel-tools/ops-help b/ops/kernel-tools/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..d2f82f9a8127710bddfaf4921861468e3130ba5d --- /dev/null +++ b/ops/kernel-tools/ops-help @@ -0,0 +1 @@ +kernel-tools: turbostat/cpupower diff --git a/ops/kernel-tools/turbostat/ops-help b/ops/kernel-tools/turbostat/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..3763d8a89699d6af62ef911d778dd4149f996694 --- /dev/null +++ b/ops/kernel-tools/turbostat/ops-help @@ -0,0 +1 @@ +turbostat: check intel and amd cpu: power, frequency, cpu register .etc diff --git a/ops/kernel-tools/turbostat/ops-run b/ops/kernel-tools/turbostat/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..15bcaa1bce7346941a82a9cd8932bbc123ad4ab3 --- /dev/null +++ b/ops/kernel-tools/turbostat/ops-run @@ -0,0 +1,18 @@ 
+#!/bin/sh +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + printf "\033[32mturbostat: check intel and amd cpu: power, frequency, cpu register .etc\033[0m\n" + /usr/bin/turbostat +} + +main $* diff --git a/ops/mem/checkcost/checkcost.sh b/ops/mem/checkcost/checkcost.sh new file mode 100755 index 0000000000000000000000000000000000000000..bf3a5855a13c05c72d22b1f130e145fe2cd4f5ba --- /dev/null +++ b/ops/mem/checkcost/checkcost.sh @@ -0,0 +1,139 @@ +#!/bin/bash +############################################### +# File Name : mem_scam.sh +# Version : V1.0 +# Auther : frankjpliu@tencent.com kkingzhang@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +drop_caches=0 +mem_cost_topn=3 + +usage="\ +Usage: + oc-ops mem checkcost [-n topn] [-d] [-h] + COMMAND-LINE Options: + -n topn表示前topn的内存耗用项,默认显示 top 3 的内存耗用项 + -d 先drop cache (echo 3 > /proc/sys/vm/drop_caches),再进行内存消耗检测 + -h 显示oc-ops mem checkcost的用法,并退出 +" + +function get_all_process_oorderbymem_res() +{ + top -b -o "%MEM" -n 1 | head -n 60 +} + +function get_slabtop() +{ + slabtop -o 1 -s l | head -n 60 +} + +function tmpfs_analysis() +{ + if [ $# -gt 0 ] && [ $0 -gt 0 ];then + mem_total=$0 + for item in `df -k | grep tmpfs` + do + tmpfs_usage=`echo $item | awk '{print $2}'` + percent=`echo | awk '{tmp=(('$tmpfs_usage'/'$mem_total')*100);if (tmp >= 5.0){print 1} else {print 0}}'` + if [ $percent -eq 1 ];then + # >= 5% + echo $item + fi + done + else + echo "confirm total mem!" + fi +} + +function vmallocinfo_analysis() +{ + awk '{s[$3] += $2}END{ for(i in s){ print i, ":" ,s[i] } }' /proc/vmallocinfo +} + +function get_process_hugetlbfs() +{ + for pid in `grep "KernelPageSize: 2048 kB" /proc/*/smaps 2>/dev/null | awk '{print $1}' | cut -d "/" -f3 | sort | uniq` + do + tmp=`echo $pid | grep -e [0-9]` + if [ "X$tmp" != "X" ];then + total=`grep -B 11 'KernelPageSize: 2048 kB' /proc/$pid/smaps 2>/dev/null | grep '^Size:' | awk 'BEGIN{sum=0}{sum+=$2}END{print sum/1024}'` + pname=`head -n 1 /proc/$pid/stat 2>/dev/null | cut -d " " -f 2 | tr -d "()"` + echo "" | awk '{printf ("%20s ","'$pname'");printf ("%20s ","'$pid'");printf ("%20s\r\n", "'$total'")}' + fi + done +} + +function mem_usage_detail() +{ + local mem_total=0 + + if (( $1 == 1 )); then + echo 3 > /proc/sys/vm/drop_caches + fi + + topn_ajusted=$(( $mem_cost_topn + 1 )) + + for item in `awk '{if ($1=="MemTotal:"){mem_total=$2;sub(/:/,"",$1);print mem_total":"$1};if ($1=="AnonPages:"){sub(/:/,"",$1);print $2":"$1};if ($1=="Shmem:"){sub(/:/,"",$1);print $2":"$1};if ($1=="Slab:"){sub(/:/,"",$1);print $2":"$1};if ($1=="VmallocUsed:"){sub(/:/,"",$1);print $2":"$1};if ($1=="HugePages_Total:"){sub(/:/,"",$1);print $2":"$1};if ($1=="Buffers:"){sub(/:/,"",$1);print $2":"$1};if ($1=="Cached:"){sub(/:/,"",$1);print $2":"$1}}' /proc/meminfo | sort -nr | head -n $topn_ajusted` + do + domain=`echo $item | awk -F ":" '{print $NF}'` + echo "$domain" + case $domain in + "Cached") + value=`echo $item | awk -F ":" '{print $1}'` + percent=`echo $value $mem_total | awk '{printf ("%2.2f%",int($1)/int($2)*100)}'` + echo -e "Cached / mem_total: ${RED} $percent ${NC}" + ;; + "Buffers") + value=`echo $item | awk -F ":" '{print $1}'` + percent=`echo $value $mem_total | awk '{printf ("%2.2f%",int($1)/int($2)*100)}'` + echo 
-e "Buffers / mem_total: ${RED} $percent ${NC}" + ;; + "AnonPages") + get_all_process_oorderbymem_res + ;; + "Shmem") + tmpfs_analysis + ;; + "Slab") + get_slabtop + ;; + "VmallocUsed") + vmallocinfo_analysis $mem_total + ;; + "HugePages_Total") + get_process_hugetlbfs + ;; + "MemTotal") + mem_total=`echo $item | awk -F ":" '{print $1}'` + echo "mem_total: $mem_total" + ;; + *) + ;; + esac + echo "" + echo "-----------------------------------------------------------" + done +} + +while getopts 'n:dh' OPT; do + case $OPT in + n) mem_cost_topn="$OPTARG" + ;; + d) drop_caches=1 + ;; + h) echo "$usage" + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac +done + +mem_usage_detail $drop_caches diff --git a/ops/mem/checkcost/ops-help b/ops/mem/checkcost/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..b097158b43cc49b3597975d892407d431313f833 --- /dev/null +++ b/ops/mem/checkcost/ops-help @@ -0,0 +1 @@ +Check memory usages diff --git a/ops/mem/checkcost/ops-run b/ops/mem/checkcost/ops-run new file mode 120000 index 0000000000000000000000000000000000000000..ff1c5a9425010e871b86b580583d56b58dcafc41 --- /dev/null +++ b/ops/mem/checkcost/ops-run @@ -0,0 +1 @@ +checkcost.sh \ No newline at end of file diff --git a/ops/mem/enter_memleak/ops-help b/ops/mem/enter_memleak/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..781306ab03a45c7164700c020209c465e29ac311 --- /dev/null +++ b/ops/mem/enter_memleak/ops-help @@ -0,0 +1 @@ +eBPF: memory_leak_kernel, see helps: t-ops mem memleak -h; driverd by eBPF diff --git a/ops/mem/enter_memleak/ops-run b/ops/mem/enter_memleak/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..204a55231617cf7215c86aec3758c99f4589b4b8 --- /dev/null +++ b/ops/mem/enter_memleak/ops-run @@ -0,0 +1,27 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +drop_caches=0 +mem_cost_topn=3 + +usage="\ +Usage: + t-ops mem memleak [-h] +" +function main() +{ + if [ ! 
-d "/usr/share/bcc/tools" ]; then + sudo yum install bcc-tools + fi + /usr/share/bcc/tools/memory_leak_kernel $@ +} + +main $* diff --git a/ops/mem/enter_memstrack/ops-help b/ops/mem/enter_memstrack/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..53906ae023638967b3409cb166b131dd88dd526b --- /dev/null +++ b/ops/mem/enter_memstrack/ops-help @@ -0,0 +1 @@ +memstrack: trace memory allocation and stack analysis .etc, can do: memleak/oom/performance diff --git a/ops/mem/enter_memstrack/ops-run b/ops/mem/enter_memstrack/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..252c71c233bd755699da41064408447cd80879dd --- /dev/null +++ b/ops/mem/enter_memstrack/ops-run @@ -0,0 +1,15 @@ +#!/bin/sh + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + dir=$(dirname $0) + printf "\033[32mresult:$dir/report_result\033[0m\n" + memstrack --notui --report task_top --output /usr/lib/tencentos-tools/ops/mem/enter_memstrack/report_result +} + +main $* diff --git a/ops/mem/ops-help b/ops/mem/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..63fe6d71cffa70da112a33d81a2f363454b90557 --- /dev/null +++ b/ops/mem/ops-help @@ -0,0 +1 @@ +Memory relative tools diff --git a/ops/mem/vmstat/ops-help b/ops/mem/vmstat/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..976df0153e670ace516d7872efc4407839f4a0ec --- /dev/null +++ b/ops/mem/vmstat/ops-help @@ -0,0 +1 @@ +vmstat: report virtual memory,cpu,disk,process status. diff --git a/ops/mem/vmstat/ops-run b/ops/mem/vmstat/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..92e4bc74c7a1585524c865abfc95823b36c9907d --- /dev/null +++ b/ops/mem/vmstat/ops-run @@ -0,0 +1,52 @@ +#!/bin/bash +############################################### # File Name : mem_scam.sh +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function strstr() +{ + echo $1 | grep $2 +} + +usage="\ +Usage: + t-ops mem vmstat [-h] +" +function help() +{ + while getopts 'h' OPT; do + case $OPT in + h) echo "$usage" + vmstat -h + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done +} + +function main() +{ + + + if [ $# -ge 1 ]; then + ret=$(strstr $1 "\-h") + if [ ! -z "$ret" ]; then + help $* + exit 1 + fi + fi + + vmstat $@ +} + +main $* diff --git a/ops/misc/tracesig/tracesig.sh b/ops/misc/tracesig/tracesig.sh new file mode 100755 index 0000000000000000000000000000000000000000..ef48e4d6c499707795675ed2cdb0d84f917aed68 --- /dev/null +++ b/ops/misc/tracesig/tracesig.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +usage="\ +Usage: + oc-ops misc tracesig [-p pid| -c comm | -s sig] -e|-d|-r + COMMAND-LINE Options: + -p, 设置要跟踪的接收信号进程pid + -s, 只跟踪某个信号 + -d, 结束跟踪 + -r, 查看信号发送信息 +" + +if (( $# < 1 )); then + echo "$usage" + exit 1 +fi + +export logdir="/data/oc-ops/misc/tracesig" + +export curr_dir=$(pwd) +export work_dir=$(readlink /proc/$$/fd/255); work_dir=$(dirname $work_dir); cd "$work_dir" +export run_log="$logdir/run_log" + +source lib.sh +set_cgroup_rights + +tracee_pid=0 +tracee_sig=0 + +check_logdir() +{ + echo "$logdir" | grep ^[/] > /dev/null ; local ret_val=$? + if (( $ret_val != 0 )); then echo "Must using absolute path!"; return 1 ; fi + + echo "$logdir" | grep "misc/tracesig" > /dev/null ; ret_val=$? 
+ if (( $ret_val != 0 )); then echo "Path must including \"misc/tracesig\" substring!"; return 1 ; fi + + echo "$logdir" | grep -E '[ | ]' > /dev/null ; ret_val=$? + if (( $ret_val == 0 )); then echo "Path must not including space and tab char!"; return 1 ; fi + + return 0 +} + +safe_rm() +{ + echo "$PWD/" | grep "$logdir" > /dev/null ; local ret_val=$? + if (( $ret_val != 0 )); then echo "Forbid rm outside the $logdir dir!"; return 1 ; fi + + echo "$@" | grep "/" > /dev/null ; ret_val=$? + if (( $ret_val == 0 )); then echo "Forbid having \"/\" in args!"; return 1 ; fi + + rm $@ 2>/dev/null +} +export -f safe_rm + +mkdir -p "$logdir" + +filter="--filter" +get_filter() +{ + if (( $tracee_pid != 0 )); then + if (( $tracee_sig != 0 )); then + filter=$( echo "$filter 'pid == $tracee_pid && sig == $tracee_sig'") + return + else + filter=$( echo "$filter 'pid == $tracee_pid'") + fi + fi + + if (( $tracee_sig != 0 )); then + filter=$( echo "$filter 'sig == $tracee_sig'") + fi +} + +perf_pid="" +enable_trace() +{ + check_logdir ; cd $logdir + safe_rm -f * + get_filter + perf_cmd="perf record -e signal:signal_generate $filter &" + eval $perf_cmd + perf_pid=$! + echo $perf_pid > perf_pid +} + +disable_trace() +{ + check_logdir ; cd $logdir + perf_pid=$(cat perf_pid) + kill -15 $perf_pid +} + +read_trace_result() +{ + check_logdir ; cd $logdir + perf script +} + +while getopts 'p:s:edrh' OPT; do + case $OPT in + p) tracee_pid="$OPTARG" + ;; + s) tracee_sig="$OPTARG" + ;; + d) disable_trace + exit 0 + ;; + r) read_trace_result + exit 0 + ;; + h) echo "$usage" + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac +done + +enable_trace diff --git a/ops/net/arp/ops-help b/ops/net/arp/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..f431e94189d423d8d707b8697763aa75f5341d3b --- /dev/null +++ b/ops/net/arp/ops-help @@ -0,0 +1 @@ +net: arp tools, see t-ops net arp -h diff --git a/ops/net/arp/ops-run b/ops/net/arp/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..54c3149c756abd46df0c0f6a2289b0e9f189c593 --- /dev/null +++ b/ops/net/arp/ops-run @@ -0,0 +1,52 @@ +#!/bin/bash +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function strstr() +{ + echo $1 | grep $2 +} +usage="\ +Usage: + /* arp tools */ + t-ops net arp [-h] +" +function help() +{ + while getopts 'h' OPT; do + case $OPT in + h) echo "$usage" + arp -h + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done +} +function main() +{ + if [ $# -lt 1 ]; then + echo "$usage" + exit 1 + fi + + ret=$(strstr $1 "\-h") + if [ ! 
-z "$ret" ]; then + help $* + exit 1 + fi + + arp $@ +} + +main $* diff --git a/ops/net/iftop/ops-help b/ops/net/iftop/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..2cc6a7cca9db21077b3312aab17a3d006f277100 --- /dev/null +++ b/ops/net/iftop/ops-help @@ -0,0 +1 @@ +net: iftop tools, report network bandwidth, see t-ops net iftop -h diff --git a/ops/net/iftop/ops-run b/ops/net/iftop/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..95df32bdfc34a333b5f1970ee0e96262b234c88d --- /dev/null +++ b/ops/net/iftop/ops-run @@ -0,0 +1,52 @@ +#!/bin/bash +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function strstr() +{ + echo $1 | grep $2 +} +usage="\ +Usage: + /* iftop tools */ + t-ops net iftop [-h] +" +function help() +{ + while getopts 'h' OPT; do + case $OPT in + h) echo "$usage" + iftop -h + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done +} +function main() +{ + installed=$(strstr "rpm -qa" htop ) + if [ -z "$installed" ] ; then + yum install iftop -y + fi + + ret=$(strstr $1 "\-h") + if [ ! -z "$ret" ]; then + help $* + exit 1 + fi + + iftop $@ +} + +main $* diff --git a/ops/net/ip/ops-help b/ops/net/ip/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..6394839f680b5e4ceef7c1ede19c1963dce73882 --- /dev/null +++ b/ops/net/ip/ops-help @@ -0,0 +1 @@ +net: ip tools, see t-ops net ip -h diff --git a/ops/net/ip/ops-run b/ops/net/ip/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..2397d55e770fc43291a0d068a478f36b96f0f914 --- /dev/null +++ b/ops/net/ip/ops-run @@ -0,0 +1,52 @@ +#!/bin/bash +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function strstr() +{ + echo $1 | grep $2 +} +usage="\ +Usage: + /* ip tools */ + t-ops net ip [-h] +" +function help() +{ + while getopts 'h' OPT; do + case $OPT in + h) echo "$usage" + ip -h + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done +} +function main() +{ + if [ $# -lt 1 ]; then + echo "$usage" + exit 1 + fi + + ret=$(strstr $1 "\-h") + if [ ! 
-z "$ret" ]; then + help $* + exit 1 + fi + + ip $@ +} + +main $* diff --git a/ops/net/netstat/ops-help b/ops/net/netstat/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..288ed52bd5a80a401080e0200d973826c2610be6 --- /dev/null +++ b/ops/net/netstat/ops-help @@ -0,0 +1 @@ +net: netstat tools, see t-ops net netstat -h diff --git a/ops/net/netstat/ops-run b/ops/net/netstat/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..179c5d73849a7523b3073d5818e782e448b7b880 --- /dev/null +++ b/ops/net/netstat/ops-run @@ -0,0 +1,53 @@ +#!/bin/bash +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function strstr() +{ + echo $1 | grep $2 +} +usage="\ +Usage: + /* netstat tools */ + t-ops net netstat [-h] +" +function help() +{ + while getopts 'h' OPT; do + case $OPT in + h) echo "$usage" + netstat -h + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done +} +function main() +{ + + if [ $# -lt 1 ]; then + echo "$usage" + exit 1 + fi + + ret=$(strstr $1 "\-h") + if [ ! -z "$ret" ]; then + help $* + exit 1 + fi + + netstat $@ +} + +main $* diff --git a/ops/net/ops-help b/ops/net/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..49caf8e0de79421fd1d7d3a965ca3cdf8f1b9829 --- /dev/null +++ b/ops/net/ops-help @@ -0,0 +1 @@ +net tools, such as ss/netstat .etc diff --git a/ops/net/ss/ops-help b/ops/net/ss/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..d6e89881dda694a9038fd42ddece252d71dc328e --- /dev/null +++ b/ops/net/ss/ops-help @@ -0,0 +1 @@ +net: ss tools, see t-ops net ss -h diff --git a/ops/net/ss/ops-run b/ops/net/ss/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..70aeb8884030cb0f31d6ae862285541612190c48 --- /dev/null +++ b/ops/net/ss/ops-run @@ -0,0 +1,52 @@ +#!/bin/bash +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### +GREEN='\033[1;32m' +RED='\033[1;31m' +BLUE='\033[1;34m' +NC='\033[0m' + +function strstr() +{ + echo $1 | grep $2 +} +usage="\ +Usage: + /* ss tools */ + t-ops net ss [-h] +" +function help() +{ + while getopts 'h' OPT; do + case $OPT in + h) echo "$usage" + ss -h + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac + done +} +function main() +{ + if [ $# -lt 1 ]; then + echo "$usage" + exit 1 + fi + + ret=$(strstr $1 "\-h") + if [ ! 
-z "$ret" ]; then + help $* + exit 1 + fi + + ss $@ +} + +main $* diff --git a/ops/os_stat/ops-help b/ops/os_stat/ops-help new file mode 100755 index 0000000000000000000000000000000000000000..4a0281709043bcda9012f8b90d1dceed9fa5976f --- /dev/null +++ b/ops/os_stat/ops-help @@ -0,0 +1 @@ +stat kernel performance: show detail information by" oc-ops os_stat -h" diff --git a/ops/os_stat/ops-run b/ops/os_stat/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..945731442f6b6f49be5b21368b0530378bf6048a --- /dev/null +++ b/ops/os_stat/ops-run @@ -0,0 +1,478 @@ +#!/bin/sh +function strstr() +{ + echo $1 | grep $2 +} + +function is_pub() +{ + res=$(strstr $(uname -r) "0009") +} + +function include_link() +{ + res=$(strstr $(uname -r) "0009") + res_3_10=$(strstr $(uname -r) "3.10") + res_4_14=$(strstr $(uname -r) "4.14") + res_5_4=$(strstr $(uname -r) "5.4.203") + if [ -z "$res_5_4" ]; then + res_5_4=$(strstr $(uname -r) "5.4.119") + fi + res_5_4_new=$(strstr $(uname -r) "5.4.241-1") + res_5_4_arm=$(strstr $(uname -r) "5.4.241-24") + res_6_6=$(strstr $(uname -r) "6.6") + if [ ! -z "$res" ]; then + rm include + ln -sf include_pub include + elif [ ! -z "$res_5_4" ] || [ ! -z "$res_5_4_new" ]; then + rm include + ln -sf include_private include + elif [ ! -z "$res_6_6" ]; then + rm include + ln -sf include_6_6 include + elif [ ! -z "$res_4_14" ]; then + rm include + ln -sf include_tk3 include + elif [ ! -z "$res_5_4_arm" ]; then + rm include + ln -sf include_tk4_arm include + elif [ ! -z "$res_3_10" ]; then + rm include + ln -sf include_tk2 include + fi +} + +function helper() +{ + echo -f, --function: "which scan from this function", more, uses -f f1 -1 f2 -2 f3 ... -40 f40 .etc + echo -f, --from: "monitor latency between two function, from f1 to f2" + echo -t, --to: "monitor latency between two function, from f1 to f2" + echo -fg, --flags: "0:test hot path, 1: test one function, 2: test all syscall performance and throughput, 3: parameter optimize;" + echo -ht, --hot: "only -fg 0/hot path case: first scan hot path, -ht 1, later -ht 0. 
other case -fg: no need assign -ht" + echo -d, --dir: "only -fg 0 need assign -d: linux kernel source dir, uses server os kernel source code, or open source whoes version same as server os" + echo -de, --delay: "duration, per function scan, default 5s;" + echo -sig, --signal: "signal:(2, SIGRTMAX)" + echo -sc, --scene: "support scene, reuse same var with signal:(SIGRTMAX, )" + echo -b, --block: "0: no block, 1: block" + echo -o, --ondemand: "0:scan function ondemand; 1:scan function from all kernel dir; default 0" + echo -ft, --fast: "0: trace one function once time, 1: trace all all sub function of parent function once time, default 1" + echo -p, --proc: "only track the process assigned by proc, default: no proc assigned" + echo -s, --sample: "sample rate for monitor, such as: -s 100, sample 1 per 100" + echo -h, --help: "help information" + echo such as: + echo \(1\)test hotpath\(start form vfs_read, 1st time\): + echo -e "\033[32m t-ops os_stat -d /data/tkernel4\(or opensource /data/linux, version is the same to trace\) -f vfs_read -fg 0 -ht 1 -de 5 -o 0 -ft 1 \033[0m" + echo \(2\)test hotpath\(start form vfs_read, 2rd time\): + echo -e "\033[32m t-ops os_stat -d /data/tkernel4 -f vfs_read -fg 0 -ht 0 -de 5 -o 0 -ft 1 \033[0m" + echo \(3\)test one function\(vfs_read performance\): + echo -e "\033[32m t-ops os_stat -f vfs_read -fg 1 -de 5 -o 0 \033[0m" + echo \(4\)test all system performance: + echo -e "\033[32m t-ops os_stat -fg 2 -de 1 \033[0m" + echo \(5\)test parameter optimise: + echo -e "\033[32m t-ops os_stat -fg 3 -de 5 \033[0m" + echo \(6\)show parameter of kernel function see help: + echo -e "\033[32m t-ops os_stat -fg 5 -h \033[0m" + echo \(7\)debug user function see help: + echo -e "\033[32m t-ops os_stat -fg 6 -h \033[0m" + echo \(8\)performance monitor, get latency of each function, now support 5 function, uses -f/-1/-2/-3/-4/-5: + echo -e "\033[32m t-ops os_stat -f vfs_read -1 submit_bio -s 100 -fg 7 -de 5 -o 0 \033[0m" + echo \(9\)performance monitor, get latency between two function, such as from \"-f vfs_read\" to \"-t submit_bio\" + echo -e "\033[32m t-ops os_stat -f vfs_read -t submit_bio -s 100 -fg 8 -de 5 -o 0 \033[0m" +} +function main() +{ + POSITIONAL=() + num=0 + + PERFORMANCE="1" + PROC="0" + DELAY="5" + BLOCK="0" + FLAGS=0 + TOTALNUM=0 + FUNCTION={} + SAMPLE=1 + SIGNAL=9 + START_FUNCTION="" + END_FUNCTION="" + VAR_VAL=0 + + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -d|--dir) + DIR="$2" + shift # past argument + shift # past value + ;; + -t|--to) + END_FUNCTION="$2" + shift # past argument + shift # past value + ;; + -f|--function|--from) + FUNCTION1="$2" + START_FUNCTION="$2" + FUNCTION[num]="-f $2" + TOTALNUM=$(($TOTALNUM+1)) + shift # past argument + shift # past value + ;; + -1|--function1) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + shift # past argument + shift # past value + ;; + -2|--function2) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + shift # past argument + shift # past value + ;; + -3|--function3) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + shift # past argument + shift # past value + ;; + -4|--function4) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + shift # past argument + shift # past value + ;; + -5|--function5) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + shift # past argument + shift # past value + ;; + -6|--function6) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -7|--function7) + 
FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -8|--function8) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -9|--function9) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -10|--function10) + FUNCTION[num]="--$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -11|--function11) + FUNCTION[num]="--$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -12|--function12) + FUNCTION[num]="--$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -13|--function13__sk_destruct) + FUNCTION[num]="--$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -6|--function6) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -7|--function7) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -8|--function8) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -9|--function9) + FUNCTION[num]="-$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -10|--function10) + FUNCTION[num]="--$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -11|--function11) + FUNCTION[num]="--$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -12|--function12) + FUNCTION[num]="--$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -13|--function13__sk_destruct) + FUNCTION[num]="--$num $2" + TOTALNUM=$(($TOTALNUM+1)) + num=$(($num+1)) + shift # past argument + shift # past value + ;; + -s|--sample) + SAMPLE=$2 + shift # past argument + shift # past value + ;; + -sig|--signal) + SIGNAL=$2 + shift # past argument + shift # past value + ;; + -var|--variable) + VAR_VAL=$2 + shift # past argument + shift # past value + ;; + -fg|--flags) + FLAGS=$2 + shift # past argument + shift # past value + ;; + -ht|--hot) + HOT="$2" + shift # past argument + shift # past value + ;; + -de|--delay) + DELAY="$2" + shift # past argument + shift # past value + ;; + -b|--block) + BLOCK="$2" + shift # past argument + shift # past value + ;; + -o|--ondemand) + ONDEMAND="$2" + shift # past argument + shift # past value + ;; + -ft|--fast) + PERFORMANCE="$2" + shift # past argument + shift # past value + ;; + -p|--proc) + PROC="$2" + shift # past argument + shift # past value + ;; + -n|--num) + TOTALNUM="$2" + shift # past argument + shift # past value + ;; + -h|--help) + helper + return + shift # past argument + ;; + *) + POSITIONAL+=("$1") # save it in an array for later + shift # past argument + ;; + esac + num=$(($num+1)) + if [ $FLAGS -ge 5 ] && [ $FLAGS -lt 7 ]; then + break + fi + done + + file_dir=$(dirname $0) + if [ $FLAGS -eq 11 ]; then + #get data only + #PRINT= 32 + 2 + ; start from 35 + cd $file_dir/os_stat_user + ./os_stat_data; + return + fi + if [ $FLAGS -eq 12 ]; then + #get data only + #PRINT= 32 + 2 + ; start from 35 + cd $file_dir/os_stat_user + ./os_stat_test; + return + 
fi + #check paremter counts + if [[ $num -lt 2 && $# -lt 2 ]]; then + if [ $FLAGS == 5 ]; then + cd $file_dir/os_stat_show_parameter + ./compute.sh $* + return + fi + if [ $FLAGS == 6 ]; then + cd $file_dir/os_stat_uprobe + ./uprobe_trace.sh $* + return + fi + helper + return + fi + + cd /usr/lib/opencloudos-tools/ops/os_stat/os_stat/ + + #soft link include, pointer to kernel + include_link + + #make modules + res_6_6=$(strstr $(uname -r) "6.6") + if [ ! -z "$res_6_6" ]; then + make -C /usr/src/kernels/$(uname -r) M=`pwd` modules version=6.6 + else : + make -C /usr/src/kernels/$(uname -r) M=`pwd` modules version=0 + fi + mv os_aware.ko ../os_stat_user/ + + #enter user dir + cd $file_dir/os_stat_user + + #do the test + res=$(is_pub) + if [ ! -z "$res" ]; then + module=os_aware.ko + else : + tk4=$(strstr $(uname -r) "5.4") + if [ ! -z "$tk4" ]; then + tk4=${tk4:0:22} + module=os_aware_$tk4.ko.xz + elif [ -f "os_aware_$(uname -r).ko.xz" ] ; then + module=os_aware_$(uname -r).ko.xz + fi + fi + + if [ ! -z "$res" ] || [ ! -f "$module" ]; then + cd $file_dir/os_stat/ + #soft link include, pointer to kernel + include_link + + #make modules + res_6_6=$(strstr $(uname -r) "6.6") + if [ ! -z "$res_6_6" ]; then + make -C /usr/src/kernels/$(uname -r) M=`pwd` modules version=6.6 + else : + make -C /usr/src/kernels/$(uname -r) M=`pwd` modules version=0 + fi + if [ $FLAGS -lt 5 ] || [ $FLAGS -ge 7 ]; then + mv os_aware.ko ../os_stat_user/ + else : + mv os_aware.ko ../os_stat_show_parameter/ + fi + fi + + if [ $FLAGS != 5 ]; then + #enter user dir + cd $file_dir/os_stat_user + make + make os_stat_data + make os_stat_test + make os_stat_scene_nohook + else : + cd $file_dir/os_stat_show_parameter + fi + if [ ! -f "$file_dir/os_stat_user/$module" ] ; then + module=os_aware.ko + fi + + if [ $FLAGS -eq 0 ]; then + if [ ! 
-z "$ONDEMAND" ]; then + ./stat.sh $module $DIR $FUNCTION1 $HOT $DELAY $ONDEMAND 1 $PERFORMANCE $PROC + else : + ./stat.sh $module $DIR $FUNCTION1 $HOT $DELAY 0 1 $PERFORMANCE $PROC + fi + else : + insmod $file_dir/os_stat_user/$module + if [ $FLAGS -eq 1 ]; then + IFS=,; for SUBFUNCTION in $FUNCTION1; do + ./os_stat_blongm -f $SUBFUNCTION -1 tmp -2 tmp -n 1 -h 0 --de $DELAY -i 0 -b $BLOCK -x; + done + elif [ $FLAGS -eq 2 ] ; then + ./os_stat_blongm -f tmp -1 tmp -1 tmp -i 1 --de $DELAY + + elif [ $FLAGS -eq 3 ] ; then + cd $file_dir/os_stat_paremter + ./paremter_main.py $DELAY + elif [ $FLAGS -eq 5 ] ; then + #echo $* $PWD + ./compute.sh $* + elif [ $FLAGS -eq 6 ] ; then + cd $file_dir/os_stat_uprobe + ./uprobe_trace.sh $* + elif [ $FLAGS -eq 7 ] ; then + if [ $SAMPLE -lt 1 ]; then + SAMPLE=1 + fi + ./os_stat_blongm ${FUNCTION[@]} -n $TOTALNUM --ptr 0 -s $SAMPLE --de $DELAY --var $VAR_VAL -i 0 -b $BLOCK -x; + elif [ $FLAGS -eq 8 ] ; then + ./os_stat_blongm ${FUNCTION[@]} -n $TOTALNUM --ptr 0 -s $SAMPLE --de $DELAY -i 2 -b $BLOCK -x; + elif [ $FLAGS -eq 9 ]; then + PRINT=$(($SIGNAL+2)) + ./os_stat_blongm ${FUNCTION[@]} -n $TOTALNUM --ptr 0 --ht 0 --de $DELAY --sc $PRINT -b $BLOCK -x; + elif [ $FLAGS -eq 10 ]; then + #PRINT= 32 + 2 + ; start from 35 + echo ${FUNCTION[@]},,,,, $TOTALNUM + ./os_stat_blongm ${FUNCTION[@]} -n $TOTALNUM --ptr 0 --ht 0 --de $DELAY --sc $PRINT -b $BLOCK -x; + elif [ $FLAGS -eq 13 ]; then + #PRINT= 32 + 2 + ; start from 35 + ./os_stat_scene_nohook ${FUNCTION[@]} -n $TOTALNUM --sc $PRINT -x; + else : + echo -e "\033[32m -fg error, not support flag\033[0m" + fi + rmmod os_aware + fi + +} + +main $* diff --git a/ops/os_stat/os_stat/Makefile b/ops/os_stat/os_stat/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8a03c737620acf38b93a0cc035bf239b27df95b1 --- /dev/null +++ b/ops/os_stat/os_stat/Makefile @@ -0,0 +1,35 @@ +# +# Makefile for sys module +# +#$(info ${version}) +obj-m += os_aware.o + +os_aware-y := main.o data_aware.o syms.o hook_tk5.o hook.o kretprobe_prehook.o kprobe_prehook.o sysctl.o ftrace_hook.o func_pointer_table.o func_struct_table.o scene_layer.o io_scene/io_scene.o mm_scene/memory_scene.o store_data.o scene_template.o catch_signal.o parse_paramter.o base_function/irq.o +ifeq (${version}, 6.6) +os_aware-y +=func_pointer_table_6_6.o +else +os_aware-y += func_pointer_table_5_4.o +endif +ifneq (${version}, 3.10) +os_aware-y += net_scene/slub_scan.o io_scene/io_bfq_scene.o net_scene/net_scene.o base_lib.o +else +os_aware-y += base_lib_tk2.o +endif + +KERNELDIR=`pwd`/include +MODULEDIR=`pwd` +KERNEL_VERSION := $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) +KBUILD_CFLAGS+=-I$(PWD)/include/mm +KBUILD_CFLAGS+=-I$(PWD)/include/fs/ext4/ +KBUILD_CFLAGS+=-I$(PWD)/include/fs/ext4_new/ +ifeq ($(VERSION).$(PATCHLEVEL).$(SUBLEVEL), 5.4.203) +KBUILD_CFLAGS+=-I$(PWD)/include/mm/mm_le_0011 +else ifeq ($(VERSION).$(PATCHLEVEL).$(SUBLEVEL), 5.4.119) +KBUILD_CFLAGS+=-I$(PWD)/include/fs/ext4_old/ +KBUILD_CFLAGS+=-I$(PWD)/include/mm/mm_le_0011 +endif + +default: + /usr/bin/make -C $(KERNELDIR) M=$(MODULEDIR) modules +clean: + rm *.o *.ko .data* .func* .hook* io_scene/*.o io_scene/.io* .os* .syms* .sys* .ftrace* .kprobe* .main* *mod.c .kret* .net* .scene* net_scene/*.o net_scene/.net* net_scene/.slub* mm_scene/*.o mm_scene/.memory_scene* .store* os_aware.mod .debug_scene.* .catch_signal* .base_lib.* .parse_paramter.* base_function/*.o base_function/.irq* diff --git a/ops/os_stat/os_stat/README b/ops/os_stat/os_stat/README new file mode 100644 
index 0000000000000000000000000000000000000000..365942857a00da04a72ad0e65ba2585c251f6968 --- /dev/null +++ b/ops/os_stat/os_stat/README @@ -0,0 +1,22 @@ +Use ftrace/kretprobe/kprobe to hook function +and stat the performance data which is show by sysctl. + +do: +1.debug,hook function, print function infomation; +2.check performance: + (1)one function, echo function > /proc/sys/kprobe_register_func to stat latency, + and cat /proc/sys/func_data to get latency data. + (2)work with os_stat_uer to scan hot function path automaticly. + +such as: +at /proc/sys/os_aware/: +#all_data: show 10 of the max lantecy contain syscall/page fault/mem/numa/process num. + echo 0 > all_data; dmesg; +#data:show 10 of the max lantecy about syscall/page fault; +#func_data: show unction data. + + + + + + diff --git a/ops/os_stat/os_stat/base_function/irq.c b/ops/os_stat/os_stat/base_function/irq.c new file mode 100644 index 0000000000000000000000000000000000000000..f300dae3ee9e9a5a5e23facb227cbc7b8634ccb3 --- /dev/null +++ b/ops/os_stat/os_stat/base_function/irq.c @@ -0,0 +1,87 @@ +/* + * stat function performance + * aurelianliu@tencent.com + */ +#include "../scene_layer.h" +#include "../syms.h" + +static bool work_start; +static unsigned long *stat_irq_status; +static unsigned long stat_err_irq; +void stat_irq(struct work_struct *work); +static DECLARE_DELAYED_WORK(stat_irq_work, stat_irq); + +void stat_irq(struct work_struct *work) +{ + int cpu; + unsigned long time, cycle = 0; + + for_each_possible_cpu(cpu) { + struct kernel_cpustat *kcs = &kcpustat_cpu(cpu); +#ifdef CONFIG_VM_EVENT_COUNTERS + struct vm_event_state *vm_stat = &per_cpu(vm_event_states, cpu); +#endif + time = kcs->cpustat[CPUTIME_IRQ]; +#ifdef CONFIG_X86_64 + cycle = kstat_cpu_irqs_sum(cpu) + stat_arch_irq_stat_cpu(cpu); +#endif + store_info(cpu, time - stat_irq_status[cpu], cycle - stat_irq_status[NR_CPUS + cpu], + "stat:cpu:", "irq time:", "stat cycles"); + stat_irq_status[cpu] = time; + stat_irq_status[NR_CPUS + cpu] = cycle; + } +#ifdef CONFIG_X86_64 + cycle = stat_arch_irq_stat(); + store_info(0, 0, cycle - stat_err_irq, "stat:err count:", "irq:", "stat cycles:"); + stat_err_irq = cycle; +#endif + + if (sysctl_module_enable_irq == 1) + schedule_delayed_work(&stat_irq_work, 100); +} + +int sysctl_irq_enable_handler(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos) +#else + void __user *buffer, size_t *lenp, loff_t *ppos) +#endif +{ + int ret, cpu; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!write) + return ret; + + if (sysctl_module_enable_irq == 1) { + + if (work_start) + return ret; + + if (!stat_irq_status) + stat_irq_status = (unsigned long *)vzalloc(2 * NR_CPUS * sizeof(unsigned long)); + if (!stat_irq_status) { + pr_err("alloc stat_irq_status failed\n"); + return ret; + } + for_each_possible_cpu(cpu) { + struct kernel_cpustat *kcs = &kcpustat_cpu(cpu); +#ifdef CONFIG_VM_EVENT_COUNTERS + struct vm_event_state *vm_stat = &per_cpu(vm_event_states, cpu); +#endif + stat_irq_status[cpu] = kcs->cpustat[CPUTIME_IRQ]; +#ifdef CONFIG_X86_64 + stat_irq_status[NR_CPUS + cpu] = kstat_cpu_irqs_sum(cpu) + stat_arch_irq_stat_cpu(cpu); +#endif + } +#ifdef CONFIG_X86_64 + stat_err_irq = stat_arch_irq_stat(); +#endif + + work_start = true; + schedule_delayed_work(&stat_irq_work, 100); + } + + return ret; +} diff --git a/ops/os_stat/os_stat/base_lib.c b/ops/os_stat/os_stat/base_lib.c new file mode 100644 index 
0000000000000000000000000000000000000000..299046b73df171f5943d1e28544495d453b160e1 --- /dev/null +++ b/ops/os_stat/os_stat/base_lib.c @@ -0,0 +1,843 @@ +/* + * base functions + * aurelianliu@tencent.com + * part I. lightweight functions + * part II. buffer alloction + * part III.no crash support + * part IV. performance support + */ + +#include "data_aware.h" +#include "scene_layer.h" +#include "hook.h" + +/* + * part I. + * lightweight functions + */ +#ifdef CONFIG_ARM64 +unsigned long rdtsc(void) +{ + unsigned long count_num; + __asm__ __volatile__ ("mrs %0, cntvct_el0" : "=r" (count_num)); + return count_num; +} +#endif + +/* + * part II. + * buffer alloction + */ + +unsigned long stat_one_func_size_new = 0; +unsigned long stat_one_func_item_size = 0; +unsigned long *stat_one_func_time = NULL; +unsigned long stat_one_func_size = 0; +/* stat_one_func_time has 5 parts, 2 * start time, total time, block time, + * call num and pid + * each part has nr_cpu_ids * ENTRY_TIMES_PERCP items + * start time * 2: record func start time + * total time: record func total time + * block time: record func total time, which has block time + * call num: record func call num + * pid: record proc pid in one cpu, to avoid multi process call conflict + * stat_one_func_time layout: + * |start time|start time|total time|block time|call num|pid|... + */ +#define ENTRY_TIMES_PERCPU 6 +#define STAT_HAS_PART 5 +/* recored func start time mem index, to find addr in stat_one_func_time */ +unsigned long stat_one_func_start_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +unsigned long stat_one_func_start_2_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +/* recored total time mem index, to find addr in stat_one_func_time */ +unsigned long stat_one_func_total_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +unsigned long stat_one_func_block_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +/* recored func call mem index, to find addr in stat_one_func_time */ +unsigned long stat_one_func_num_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +struct address_range func_performance_range; +unsigned long stat_func_block_time[HOOK_FUNC_NUM]; +/* recored each cpu entry index, to find addr in stat_one_func_time */ +unsigned long stat_percpu_entry_array[NR_CPUS][ENTRY_TIMES_PERCPU]; +unsigned long stat_percpu_index_array[NR_CPUS][ENTRY_TIMES_PERCPU]; +unsigned long stat_percpu_reentry_array[NR_CPUS][ENTRY_TIMES_PERCPU]; +int alloc_buffer_for_stat(int hook_count) +{ + int i, j; + + stat_one_func_item_size = nr_cpu_ids * sizeof(unsigned long) * ENTRY_TIMES_PERCPU; + stat_one_func_size_new = (hook_count + 1) * STAT_HAS_PART * stat_one_func_item_size; + if ((hook_count > 0) && (stat_one_func_time == NULL || stat_one_func_size_new > stat_one_func_size)) { + stat_one_func_size = stat_one_func_size_new; + if (stat_one_func_time) + vfree(stat_one_func_time); + + stat_one_func_time = (unsigned long *)vzalloc(stat_one_func_size); + pr_err("alloc_buffer_for_stat %lx, item:%d\n", stat_one_func_time, (hook_count + 1) * STAT_HAS_PART * ENTRY_TIMES_PERCPU * nr_cpu_ids); + if (!stat_one_func_time) { + pr_err("alloc_buffer_for_stat failed\n"); + return -ENOMEM; + } + + func_performance_range.start = (unsigned long)stat_one_func_time; + func_performance_range.end = (unsigned long)stat_one_func_time + stat_one_func_size;; + + for (i = 0; i <= hook_count; i++) { + for (j = 0; j < ENTRY_TIMES_PERCPU; j++) { + /* each func in which array index of stat_one_func_time */ + /* each func uses one item + * func0:0: start time index of one func(one func is one index: func 
parameter:"index") + * func0:nr_cpu_ids: total time index + * func0:nr_cpu_ids*2: call num index + * func1:1: start time index of one func(one func is one index: func parameter:"index") + * func1:nr_cpu_ids + 1: total time index + * func1:nr_cpu_ids*2 + 1: call num index*/ + stat_one_func_start_array[i][j] = ((i + j) * STAT_HAS_PART) * nr_cpu_ids; + stat_one_func_start_2_array[i][j] = stat_one_func_start_array[i][j] + nr_cpu_ids; + stat_one_func_total_array[i][j] = stat_one_func_start_2_array[i][j] + nr_cpu_ids; + stat_one_func_block_array[i][j] = stat_one_func_total_array[i][j] + nr_cpu_ids; + stat_one_func_num_array[i][j] = stat_one_func_block_array[i][j] + nr_cpu_ids; + } + } + } + + return 0; +} + +void vfree_buffer(void) +{ + if (stat_one_func_time) + vfree(stat_one_func_time); + stat_one_func_time = NULL; +} + +inline bool is_in_range(unsigned long addr, char *func, int line) +{ + bool ret; + + ret = (addr >= func_performance_range.start && addr < func_performance_range.end); + + if (!ret) { + sysctl_module_enable = 0; + unload_disable_module(); + pr_err("is_in_range, Error address in func:%s, line:%d", func, line); + } + + return ret; +} +enum { + CONTEXT_IN_KERNEL = 1, + CONTEXT_IN_IRQ = 2, +}; + + +/* + * set the no crash range + */ +void context_check_start(void) +{ + current->kabi_reserved1 = CONTEXT_IN_KERNEL; + if (in_interrupt()) + current->kabi_reserved1 = CONTEXT_IN_IRQ; +} + +void context_check_end(void) +{ + current->kabi_reserved1 = 0; + goto out; + + volatile int i = 0; + volatile int j = i; + volatile int k = j; + volatile int m = k; +out: + return; +} + +bool context_check(void) +{ + percpu_counter_inc(&ftrace_patch_num); + + if (!module_is_enable()) + return true; + + context_check_start(); + + return false; +} + +void context_exit(void) +{ + context_check_end(); + percpu_counter_dec(&ftrace_patch_num); +} + +/* + * process each crash case + */ +void stat_do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) +{ + int cpu = smp_processor_id(); + + enter_hook_special(); + + if (in_interrupt() && current->kabi_reserved1 != CONTEXT_IN_IRQ) + goto next; + + if (current->kabi_reserved1) { + sysctl_module_enable = 0; + unload_disable_module(); + regs->ip++; + exit_hook_special(); + return; + } +next: + + test_do_kern_addr_fault(regs, hw_error_code, address); + + exit_hook_special(); +} +void stat_no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) +{ + int cpu = smp_processor_id(); + + enter_hook_special(); + + if (in_interrupt() && current->kabi_reserved1 != CONTEXT_IN_IRQ) + goto next; + + if (current->kabi_reserved1) { + sysctl_module_enable = 0; + unload_disable_module(); + regs->ip++; + exit_hook_special(); + return; + } +next: + + test_no_context(regs, error_code, address, signal, si_code); + + exit_hook_special(); +} +void stat_do_divide_error(struct pt_regs *regs, long error_code) +{ + int cpu = smp_processor_id(); + + enter_hook_special(); + + if (in_interrupt() && current->kabi_reserved1 != CONTEXT_IN_IRQ) + goto next; + + if (current->kabi_reserved1) { + sysctl_module_enable = 0; + unload_disable_module(); + regs->ip++; + exit_hook_special(); + return; + } +next: + + test_do_divide_error(regs, error_code); + + exit_hook_special(); +} +void stat_do_general_protection(struct pt_regs *regs, long error_code) +{ + int cpu = smp_processor_id(); + + enter_hook_special(); + + if (in_interrupt() && current->kabi_reserved1 != CONTEXT_IN_IRQ) + goto next; + + if 
(current->kabi_reserved1) { + sysctl_module_enable = 0; + unload_disable_module(); + regs->ip++; + exit_hook_special(); + return; + } +next: + + test_do_general_protection(regs, error_code); + + exit_hook_special(); +} + +/* + * process memory trampling or memory corruption + */ + + +/* + * part IV. + * performance support + */ +int find_entry_index(int cpu, int index, bool enter) +{ + int j, k = -1; + for(j = 0; j < ENTRY_TIMES_PERCPU; j++) { + if (stat_percpu_entry_array[cpu][j] == current->pid && stat_percpu_index_array[cpu][j] == index) { + if(enter) { + current->total_numa_faults++; + return -1; + } + if (--current->total_numa_faults > 0) { + return -1; + } + stat_percpu_entry_array[cpu][j] = 0; + stat_percpu_index_array[cpu][j] = 0; + return j; + } + if (stat_percpu_entry_array[cpu][j] == 0) + k = j; + } + + if (!enter) + return -1; + + if (k >= 0) { + stat_percpu_entry_array[cpu][k] = current->pid; + stat_percpu_index_array[cpu][k] = index; + current->total_numa_faults = 1; + return k; + } + + return -1; +} +/* + * to stat more function performance + * enter function: + */ +void save_start_time(int index) +{ + int i, j, k = -1; + int cpu = smp_processor_id(); + + if (index < 1 || !module_is_enable()) + goto out; + + if (register_ftrace_ftrace != FTRACE_REGISTER) + goto out; + + j = find_entry_index(cpu, index, true); + if (j < 0) + goto out; + + i = stat_one_func_start_array[index][j]; + k = stat_one_func_start_2_array[index][j]; + if (is_in_range((unsigned long)&stat_one_func_time[i + cpu], __func__, __LINE__)) + stat_one_func_time[i + cpu] = rdtsc(); + if (is_in_range((unsigned long)&stat_one_func_time[k + cpu], __func__, __LINE__)) + stat_one_func_time[k + cpu] = stat_one_func_time[i + cpu]; + current->numa_migrate_retry |= 1 << index; +out: + return; +} + +/* + * exit function: + */ +void save_total_time(int index, bool this) +{ + int i, j, k, n, m; + bool recored = false; + int cpu = smp_processor_id(), other_cpu; + + if (index < 1 || !module_is_enable()) + goto out; + + if (this && sysctl_trace_type == 1) + goto out; + + if (sysctl_trace_type == 1) { + index += 1; + this = false; + } + + if (register_ftrace_ftrace != FTRACE_REGISTER) + goto out; + + k = find_entry_index(cpu, index, false); + if (k < 0) + goto out; + + i = stat_one_func_start_array[index][k]; + j = stat_one_func_total_array[index][k]; + n = stat_one_func_start_2_array[index][k]; + m = stat_one_func_block_array[index][k]; + if (stat_one_func_time[i + cpu] > 0) { + recored = true; + if (is_in_range((unsigned long)&stat_one_func_time[j + cpu], __func__, __LINE__)) + stat_one_func_time[j + cpu] += rdtsc() - stat_one_func_time[i + cpu]; + if (is_in_range((unsigned long)&stat_one_func_time[m + cpu], __func__, __LINE__)) + stat_one_func_time[m + cpu] += rdtsc() - stat_one_func_time[n + cpu]; + } + else if (!this) { + for_each_possible_cpu(other_cpu) { + if (stat_one_func_time[i + other_cpu] > 0) { + recored = true; + if (is_in_range((unsigned long)&stat_one_func_time[j + cpu], __func__, __LINE__)) + stat_one_func_time[j + cpu] += rdtsc() - stat_one_func_time[i + other_cpu]; + if (is_in_range((unsigned long)&stat_one_func_time[m + cpu], __func__, __LINE__)) + stat_one_func_time[m + cpu] += rdtsc() - stat_one_func_time[n + other_cpu]; + break; + } + } + } + if (recored) { + i = stat_one_func_num_array[index][k]; + if (is_in_range((unsigned long)&stat_one_func_time[i + cpu], __func__, __LINE__)) + stat_one_func_time[i + cpu]++; + } + current->numa_migrate_retry ^= 1 << index; +out: + return; +} +/* + * process block time 
+ */ +void save_sched_in(void) +{ + int i, bit, k; + int cpu = smp_processor_id(); + unsigned long time; + + if (register_ftrace_ftrace != FTRACE_REGISTER) + return; + + if (context_check()) + goto out; + if (hook_count <= 0 || !stat_one_func_time) + goto out; + + time = rdtsc(); + for_each_set_bit(bit, ¤t->numa_migrate_retry, hook_count) { + k = find_entry_index(cpu, bit, true); + if (k < 0) + goto out; + + i = stat_one_func_start_array[bit][k]; + if (is_in_range((unsigned long)&stat_one_func_time[i + cpu], __func__, __LINE__)) + stat_one_func_time[i + cpu] = time; + } +out: + context_exit(); + return; +} +void save_sched_out(void) +{ + int i, j, bit, k; + int cpu = smp_processor_id(), other_cpu; + unsigned long time; + + if (register_ftrace_ftrace != FTRACE_REGISTER) + return; + + if (context_check()) + goto out; + if (hook_count <= 0 || !stat_one_func_time) + goto out; + + time = rdtsc(); + for_each_set_bit(bit, ¤t->numa_migrate_retry, hook_count) { + k = find_entry_index(cpu, bit, false); + if (k < 0) + goto out; + i = stat_one_func_start_array[bit][k]; + j = stat_one_func_total_array[bit][k]; + if (time > stat_one_func_time[i + cpu] && stat_one_func_time[i + cpu] != 0 + && is_in_range((unsigned long)&stat_one_func_time[j + cpu], __func__, __LINE__)) + stat_one_func_time[j + cpu] += time - stat_one_func_time[i + cpu]; + else if (sysctl_trace_type == 1) { + for_each_possible_cpu(other_cpu) { + if (stat_one_func_time[i + other_cpu] > 0 + && is_in_range((unsigned long)&stat_one_func_time[j + cpu], __func__, __LINE__)) { + stat_one_func_time[j + cpu] += rdtsc() - stat_one_func_time[i + other_cpu]; + break; + } + } + } + if (is_in_range((unsigned long)&stat_one_func_time[i + cpu], __func__, __LINE__)) + stat_one_func_time[i + cpu] = 0; + } +out: + context_exit(); + return; +} +/* + * compute total time + */ +void stat_total_time(void) +{ + int i, j, k, n, m, cpu; + + if (hook_count <= 0) + return; + + for(i = 1; i <= hook_count; i++) { + stat_func_total_time[i] = 0; + stat_func_total_num[i] = 0; + stat_func_block_time[i] = 0; + for(n = 0; n < ENTRY_TIMES_PERCPU; n++) { + j = stat_one_func_total_array[i][n]; + k = stat_one_func_num_array[i][n]; + m = stat_one_func_block_array[i][n]; + for_each_possible_cpu(cpu) { + stat_func_total_time[i] += stat_one_func_time[j + cpu]; + stat_func_total_num[i] += stat_one_func_time[k + cpu]; + stat_func_block_time[i] += stat_one_func_time[m + cpu]; + } + } + if (stat_func_total_num[i]) { + stat_func_total_time[i] /= stat_func_total_num[i]; + stat_func_block_time[i] /= stat_func_total_num[i]; + } + } +} +/* + * enter scheduer or interrupt + */ +void stat_time_start(void) +{ + unsigned long type, time; + unsigned long nr = get_sys_nr(); + int cpu = smp_processor_id(); + + if (!module_is_enable()) + goto out; + + if (sysctl_module_block_enable) + goto out; + + if (strstr(current->comm, "swapper")) + goto not_stat; + + type = get_func_type(); + time = rdtsc(); + if ((type & STAT_SYSCALL_TYPE) != 0 && nr < NR_syscalls && current->numa_faults_locality[0] > 0) + current->numa_faults_locality[1] += time - current->numa_faults_locality[0]; + if ((type & STAT_FUNC_TYPE) != 0 && nr < NR_syscalls && current->numa_faults_locality[1] > 0) { + current->numa_faults_locality[0] += time - current->numa_faults_locality[1]; + save_sched_out(); + } + if ((type & STAT_PAGEFAULT_TYPE) != 0) + current->numa_faults_locality[1] += time - current->numa_faults_locality[0]; + + +not_stat: + if ((type & STAT_FUNC_TYPE) == 0) + current->numa_faults_locality[0] = 0; + else + 
current->numa_faults_locality[1] = 0; +out: + return; +} +/* + * exit scheduer or interrupt + */ +void stat_time_finish(void) +{ + unsigned long type; + + if (sysctl_module_block_enable) + goto out; + + type = get_func_type(); + if ((type & STAT_FUNC_TYPE) == 0) { + current->numa_faults_locality[0] = rdtsc(); + save_sched_in(); + } + else + current->numa_faults_locality[1] = rdtsc(); +out: + + return; +} + +/* + * stat syscall + */ +void set_sys_nr(unsigned long nr) +{ + current->numa_faults_locality[2] = nr << 3; +} +unsigned long get_sys_nr(void) +{ + return current->numa_faults_locality[2] >> 3; +} +void set_func_type(int pos) +{ + current->numa_faults_locality[2] |= pos; +} +unsigned long get_func_type(void) +{ + return current->numa_faults_locality[2] & 0x07; +} +void clr_func_type(int pos) +{ + current->numa_faults_locality[2] ^= pos; +} + +void stat_stat_syscall_enter(struct kret_data *data) +{ + set_func_type(STAT_SYSCALL_TYPE); + current->numa_faults_locality[0] = rdtsc(); + current->numa_faults_locality[1] = 0; + current->numa_pages_migrated = current->numa_faults_locality[0]; + + return; +} + +void stat_stat_syscall_exit(unsigned long nr, struct kret_data *data) +{ + int cpu = smp_processor_id(); + unsigned long nr_tmp = nr, time2; + + time2 = rdtsc(); + if (module_is_enable() && nr_tmp < NR_syscalls) { + nr_tmp = array_index_nospec(nr, NR_syscalls); + if (nr_tmp < NR_syscalls + && current->numa_faults_locality[0] >= current->numa_pages_migrated + && current->numa_faults_locality[1] < time2 - current->numa_pages_migrated) { + if (!is_in_range_v((unsigned long)(&stat_sys_num[cpu][nr_tmp]), __func__, __LINE__) + || !is_in_range_v((unsigned long)(&stat_sys_time[cpu][nr_tmp]), __func__, __LINE__) + || !is_in_range_v((unsigned long)(&stat_sys_time_block[cpu][nr_tmp]), __func__, __LINE__)) + goto out; + + stat_sys_num[cpu][nr_tmp]++; + stat_sys_time[cpu][nr_tmp] += time2 - current->numa_faults_locality[0] + current->numa_faults_locality[1]; + stat_sys_time_block[cpu][nr_tmp] += time2 - current->numa_pages_migrated; + } +out: + current->numa_faults_locality[0] = 0; + current->numa_faults_locality[2] = 0; + } + + return; +} +/* + * stat normal function + * */ +void stat_func_enter(struct kret_data *data) +{ + if (current->numa_pages_migrated == 0) { + current->total_numa_faults = 0; + clr_func_type(STAT_FUNC_TYPE); + } + current->total_numa_faults++; + if (current->total_numa_faults > 1) + return; + + set_func_type(STAT_FUNC_TYPE); + current->numa_pages_migrated = rdtsc(); + current->numa_faults_locality[1] = current->numa_pages_migrated; + current->numa_faults_locality[0] = 0; + + return; +} +void stat_func_exit(struct kret_data *data) +{ + int cpu = smp_processor_id(); + unsigned long time2; + + current->total_numa_faults--; + if (current->total_numa_faults > 0) + return; + + time2 = rdtsc(); + if (module_is_enable() && stat_sys_num) { + if (current->numa_faults_locality[1] > 0 && time2 >= current->numa_faults_locality[1] + && current->numa_faults_locality[1] >= current->numa_pages_migrated + && current->numa_faults_locality[0] < (time2 - current->numa_pages_migrated)) { + stat_func_num[cpu]++; + stat_func_time[cpu] += time2 - current->numa_faults_locality[1] + current->numa_faults_locality[0]; + stat_func_time_block[cpu] += time2 - current->numa_pages_migrated; + } + clr_func_type(STAT_FUNC_TYPE); + current->numa_faults_locality[1] = 0; + current->numa_faults_locality[0] = 0; + } + + return; +} + +/* + * process irq time + */ +#ifdef CONFIG_ARM64 +void stat_gic_handle_irq(struct 
pt_regs *regs) +{ + enter_hook(); + + stat_time_start(); + context_check_end(); + + gic_handle_irq(regs); + + context_check_start(); + stat_time_finish(); + + exit_hook(); +} +#endif +#ifdef CONFIG_X86_64 +#ifdef TK5 +#define IRQ_HANDLER(name) \ +void stat_##name(struct irq_desc *desc) \ +{ \ + enter_hook(); \ +\ + stat_time_start(); \ +\ + context_check_end(); \ + __##name(desc); \ +\ + context_check_start(); \ + stat_time_finish(); \ +\ + exit_hook(); \ +\ + return; \ +\ +} +IRQ_HANDLER(handle_level_irq); +IRQ_HANDLER(handle_fasteoi_irq); +IRQ_HANDLER(handle_edge_irq); +IRQ_HANDLER(handle_simple_irq); +#else +unsigned int stat_do_IRQ(struct pt_regs *regs) +{ + int ret; + enter_hook(); + + stat_time_start(); + context_check_end(); + + ret = do_IRQ(regs); + + context_check_start(); + stat_time_finish(); + + exit_hook(); + + return ret; +} +#endif +#endif + +/* + * process scheduler time + */ +/* __schedule/prepare_task_switch */ +/* support > 5.4.203: has sched_rqm_switch*/ +void stat_sched_rqm_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + enter_hook(); + + stat_time_start(); + context_check_end(); + + sched_rqm_switch(rq, prev, next); + + context_check_start(); + exit_hook(); + return; +} + +/* support 5.4.119: has no sched_rqm_switch*/ +void stat_rcu_note_context_switch(bool preempt) +{ + enter_hook(); + + stat_time_start(); + context_check_end(); + + rcu_note_context_switch(preempt); + + context_check_start(); + exit_hook(); +} +/*psi_task_switch for tk5: prev leave, stat prev time; next come, recored start time +* as, tk5 cannot hook finish task switch. +*/ +#ifdef TK5 +void stat_psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep) +{ + int cpu = smp_processor_id(); + unsigned long nr = get_sys_nr(); + unsigned long type, time; + + enter_hook(); + + if (!module_is_enable()) + goto out; + + if (sysctl_module_block_enable) + goto out; + + if (strstr(prev->comm, "swapper")) + goto not_stat; + type = get_func_type(); + time = rdtsc(); + + if (prev->numa_faults_locality[1] == 0 && prev->numa_faults_locality[0] > 0) { + prev->numa_faults_locality[0] = 0; + prev->numa_faults_locality[2] = 0; + prev->total_numa_faults = 0; + } + if ((type & STAT_SYSCALL_TYPE) != 0 && nr < NR_syscalls + && prev->numa_faults_locality[0] >= prev->numa_pages_migrated + && prev->numa_faults_locality[0] > 0) { + prev->numa_faults_locality[1] += time - prev->numa_faults_locality[0]; + } + if ((type & STAT_FUNC_TYPE) != 0 && nr < NR_syscalls + && prev->numa_faults_locality[1] >= prev->numa_pages_migrated + && prev->numa_faults_locality[1] > 0) { + prev->numa_faults_locality[0] += time - prev->numa_faults_locality[1]; + save_sched_out(); + } + if ((type & STAT_PAGEFAULT_TYPE) != 0) { + time = rdtsc(); + prev->numa_faults_locality[1] += time - prev->numa_faults_locality[0]; + } + +not_stat: + if (type != 0 && ((type & STAT_FUNC_TYPE) == 0)) { + prev->numa_faults_locality[0] = 0; + next->numa_faults_locality[0] = rdtsc(); + } + else if ((type & STAT_FUNC_TYPE) != 0){ + prev->numa_faults_locality[1] = 0; + next->numa_faults_locality[1] = rdtsc(); + } + +out: + context_check_end(); + + __psi_task_switch(prev, next, sleep); + + context_check_start(); + exit_hook(); +} +#endif +struct rq *stat_finish_task_switch(struct task_struct *prev) +{ + unsigned long type; + struct rq *rq; + + enter_hook(); + context_check_end(); + + rq = finish_task_switch(prev); + + context_check_start(); + stat_time_finish(); + + exit_hook(); + + return rq; +} + diff --git 
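base_lib.c above keeps, per hooked function and per CPU, a start timestamp, an accumulated cycle total, a blocked-time total and a call count, all sampled with rdtsc() (cntvct_el0 on arm64); stat_total_time() then divides the accumulated totals by the call count to report an average per-call latency. A minimal user-space analogue of that accumulate-then-average flow (x86 only, using the compiler's __rdtsc() intrinsic; the busy loop stands in for the hooked kernel function, and the sched-out/block-time correction done by save_sched_in()/save_sched_out() is omitted):

    #include <stdio.h>
    #include <x86intrin.h>

    static unsigned long long total_cycles;  /* like the per-function total-time slot */
    static unsigned long long call_count;    /* like the per-function call-num slot   */

    static void hooked_func(void)
    {
        unsigned long long start = __rdtsc();       /* save_start_time()   */
        for (volatile int i = 0; i < 1000; i++)     /* work being profiled */
            ;
        total_cycles += __rdtsc() - start;          /* save_total_time()   */
        call_count++;
    }

    int main(void)
    {
        for (int i = 0; i < 100; i++)
            hooked_func();
        /* stat_total_time(): average latency = accumulated cycles / call count */
        printf("avg %llu cycles over %llu calls\n",
               total_cycles / call_count, call_count);
        return 0;
    }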
a/ops/os_stat/os_stat/base_lib_tk2.c b/ops/os_stat/os_stat/base_lib_tk2.c new file mode 100644 index 0000000000000000000000000000000000000000..c4ebed5773aa4cea38c387a0f968b635335c14b0 --- /dev/null +++ b/ops/os_stat/os_stat/base_lib_tk2.c @@ -0,0 +1,914 @@ +/* + * base functions + * aurelianliu@tencent.com + * part I. lightweight functions + * part II. buffer alloction + * part III.no crash support + * part IV. performance support + */ + +#include "data_aware.h" +#include "scene_layer.h" +#include "hook.h" +#include "version.h" + +#ifdef TK2 +#ifdef CONFIG_X86_64 +unsigned long rdtsc(void) +{ + unsigned int low, high; + + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + + return low | ((u64)high) << 32; +} +#endif +#endif + +/* + * part I. + * lightweight functions + */ +#ifdef CONFIG_ARM64 +unsigned long rdtsc(void) +{ + unsigned long count_num; + __asm__ __volatile__ ("mrs %0, cntvct_el0" : "=r" (count_num)); + return count_num; +} +#endif + +/* + * part II. + * buffer alloction + */ + +unsigned long stat_one_func_size_new = 0; +unsigned long stat_one_func_item_size = 0; +unsigned long *stat_one_func_time = NULL; +unsigned long stat_one_func_size = 0; +/* stat_one_func_time has 5 parts, 2 * start time, total time, block time, + * call num and pid + * each part has nr_cpu_ids * ENTRY_TIMES_PERCP items + * start time * 2: record func start time + * total time: record func total time + * block time: record func total time, which has block time + * call num: record func call num + * pid: record proc pid in one cpu, to avoid multi process call conflict + * stat_one_func_time layout: + * |start time|start time|total time|block time|call num|pid|... + */ +#define ENTRY_TIMES_PERCPU 6 +#define STAT_HAS_PART 5 +/* recored func start time mem index, to find addr in stat_one_func_time */ +unsigned long stat_one_func_start_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +unsigned long stat_one_func_start_2_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +/* recored total time mem index, to find addr in stat_one_func_time */ +unsigned long stat_one_func_total_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +unsigned long stat_one_func_block_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +/* recored func call mem index, to find addr in stat_one_func_time */ +unsigned long stat_one_func_num_array[HOOK_FUNC_NUM][ENTRY_TIMES_PERCPU]; +struct address_range func_performance_range; +unsigned long stat_func_block_time[HOOK_FUNC_NUM]; +/* recored each cpu entry index, to find addr in stat_one_func_time */ +unsigned long stat_percpu_entry_array[NR_CPUS][ENTRY_TIMES_PERCPU]; +unsigned long stat_percpu_index_array[NR_CPUS][ENTRY_TIMES_PERCPU]; +unsigned long stat_percpu_reentry_array[NR_CPUS][ENTRY_TIMES_PERCPU]; +int alloc_buffer_for_stat(int hook_count) +{ + int i, j; + + stat_one_func_item_size = nr_cpu_ids * sizeof(unsigned long) * ENTRY_TIMES_PERCPU; + stat_one_func_size_new = (hook_count + 1) * STAT_HAS_PART * stat_one_func_item_size; + if ((hook_count > 0) && (stat_one_func_time == NULL || stat_one_func_size_new > stat_one_func_size)) { + stat_one_func_size = stat_one_func_size_new; + if (stat_one_func_time) + vfree(stat_one_func_time); + + stat_one_func_time = (unsigned long *)vzalloc(stat_one_func_size); + pr_err("alloc_buffer_for_stat %lx, item:%d\n", stat_one_func_time, (hook_count + 1) * STAT_HAS_PART * ENTRY_TIMES_PERCPU * nr_cpu_ids); + if (!stat_one_func_time) { + pr_err("alloc_buffer_for_stat failed\n"); + return -ENOMEM; + } + + func_performance_range.start = (unsigned long)stat_one_func_time; + 
func_performance_range.end = (unsigned long)stat_one_func_time + stat_one_func_size;; + + for (i = 0; i <= hook_count; i++) { + for (j = 0; j < ENTRY_TIMES_PERCPU; j++) { + /* each func in which array index of stat_one_func_time */ + /* each func uses one item + * func0:0: start time index of one func(one func is one index: func parameter:"index") + * func0:nr_cpu_ids: total time index + * func0:nr_cpu_ids*2: call num index + * func1:1: start time index of one func(one func is one index: func parameter:"index") + * func1:nr_cpu_ids + 1: total time index + * func1:nr_cpu_ids*2 + 1: call num index*/ + stat_one_func_start_array[i][j] = ((i + j) * STAT_HAS_PART) * nr_cpu_ids; + stat_one_func_start_2_array[i][j] = stat_one_func_start_array[i][j] + nr_cpu_ids; + stat_one_func_total_array[i][j] = stat_one_func_start_2_array[i][j] + nr_cpu_ids; + stat_one_func_block_array[i][j] = stat_one_func_total_array[i][j] + nr_cpu_ids; + stat_one_func_num_array[i][j] = stat_one_func_block_array[i][j] + nr_cpu_ids; + } + } + } + + return 0; +} + +void vfree_buffer(void) +{ + if (stat_one_func_time) + vfree(stat_one_func_time); + stat_one_func_time = NULL; +} + +inline bool is_in_range(unsigned long addr, char *func, int line) +{ + bool ret; + + ret = (addr >= func_performance_range.start && addr < func_performance_range.end); + + if (!ret) { + sysctl_module_enable = 0; + unload_disable_module(); + pr_err("is_in_range, Error address in func:%s, line:%d", func, line); + } + + return ret; +} +enum { + CONTEXT_IN_KERNEL = 1, + CONTEXT_IN_IRQ = 2, +}; + + +/* + * set the no crash range + */ +void context_check_start(void) +{ + current->kabi_reserved1 = CONTEXT_IN_KERNEL; + if (in_interrupt()) + current->kabi_reserved1 = CONTEXT_IN_IRQ; +} + +void context_check_end(void) +{ + current->kabi_reserved1 = 0; + goto out; + + volatile int i = 0; + volatile int j = i; + volatile int k = j; + volatile int m = k; +out: + return; +} + +bool context_check(void) +{ + percpu_counter_inc(&ftrace_patch_num); + + if (!module_is_enable()) + return true; + + context_check_start(); + + return false; +} + +void context_exit(void) +{ + context_check_end(); + percpu_counter_dec(&ftrace_patch_num); +} + +/* + * process each crash case + */ +void stat_do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) +{ + int cpu = smp_processor_id(); + + enter_hook_special(); + + if (in_interrupt() && current->kabi_reserved1 != CONTEXT_IN_IRQ) + goto next; + + if (current->kabi_reserved1) { + sysctl_module_enable = 0; + unload_disable_module(); + regs->ip++; + exit_hook_special(); + return; + } +next: + + test_do_kern_addr_fault(regs, hw_error_code, address); + + exit_hook_special(); +} +void stat_no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) +{ + int cpu = smp_processor_id(); + + enter_hook_special(); + + if (in_interrupt() && current->kabi_reserved1 != CONTEXT_IN_IRQ) + goto next; + + if (current->kabi_reserved1) { + sysctl_module_enable = 0; + unload_disable_module(); + regs->ip++; + exit_hook_special(); + return; + } +next: + + test_no_context(regs, error_code, address, signal, si_code); + + exit_hook_special(); +} +void stat_do_divide_error(struct pt_regs *regs, long error_code) +{ + int cpu = smp_processor_id(); + + enter_hook_special(); + + if (in_interrupt() && current->kabi_reserved1 != CONTEXT_IN_IRQ) + goto next; + + if (current->kabi_reserved1) { + sysctl_module_enable = 0; + unload_disable_module(); + regs->ip++; + exit_hook_special(); 
+ return; + } +next: + + test_do_divide_error(regs, error_code); + + exit_hook_special(); +} +void stat_do_general_protection(struct pt_regs *regs, long error_code) +{ + int cpu = smp_processor_id(); + + enter_hook_special(); + + if (in_interrupt() && current->kabi_reserved1 != CONTEXT_IN_IRQ) + goto next; + + if (current->kabi_reserved1) { + sysctl_module_enable = 0; + unload_disable_module(); + regs->ip++; + exit_hook_special(); + return; + } +next: + + test_do_general_protection(regs, error_code); + + exit_hook_special(); +} + +/* + * process memory trampling or memory corruption + */ + + +/* +* part III. +* performance support +*/ +int find_entry_index(int cpu, int index, bool enter) +{ + int j, k = -1; + for(j = 0; j < ENTRY_TIMES_PERCPU; j++) { + if (stat_percpu_entry_array[cpu][j] == current->pid && stat_percpu_index_array[cpu][j] == index) { + if(enter) { + tk2_curr_ptr->total_numa_faults++; + return -1; + } + if (--tk2_curr_ptr->total_numa_faults > 0) { + return -1; + } + stat_percpu_entry_array[cpu][j] = 0; + stat_percpu_index_array[cpu][j] = 0; + return j; + } + if (stat_percpu_entry_array[cpu][j] == 0) + k = j; + } + + if (!enter) + return -1; + + if (k >= 0) { + stat_percpu_entry_array[cpu][k] = current->pid; + stat_percpu_index_array[cpu][k] = index; + tk2_curr_ptr->total_numa_faults = 1; + return k; + } + + return -1; +} +/* + * to stat more function performance + * enter function: + */ +void save_start_time(int index) +{ + int i, j, k = -1; + int cpu = smp_processor_id(); + + if (index < 1 || !module_is_enable()) + goto out; + + if (register_ftrace_ftrace != FTRACE_REGISTER) + goto out; + + j = find_entry_index(cpu, index, true); + if (j < 0) + goto out; + + i = stat_one_func_start_array[index][j]; + k = stat_one_func_start_2_array[index][j]; + if (is_in_range((unsigned long)&stat_one_func_time[i + cpu], __func__, __LINE__)) + stat_one_func_time[i + cpu] = rdtsc(); + if (is_in_range((unsigned long)&stat_one_func_time[k + cpu], __func__, __LINE__)) + stat_one_func_time[k + cpu] = stat_one_func_time[i + cpu]; + tk2_curr_ptr->numa_migrate_retry |= 1 << index; +out: + return; +} + +/* + * exit function: + */ +void save_total_time(int index, bool this) +{ + int i, j, k, n, m; + bool recored = false; + int cpu = smp_processor_id(), other_cpu; + + if (index < 1 || !module_is_enable()) + goto out; + + if (this && sysctl_trace_type == 1) + goto out; + + if (sysctl_trace_type == 1) { + index += 1; + this = false; + } + + if (register_ftrace_ftrace != FTRACE_REGISTER) + goto out; + + k = find_entry_index(cpu, index, false); + if (k < 0) + goto out; + + i = stat_one_func_start_array[index][k]; + j = stat_one_func_total_array[index][k]; + n = stat_one_func_start_2_array[index][k]; + m = stat_one_func_block_array[index][k]; + if (stat_one_func_time[i + cpu] > 0) { + recored = true; + if (is_in_range((unsigned long)&stat_one_func_time[j + cpu], __func__, __LINE__)) + stat_one_func_time[j + cpu] += rdtsc() - stat_one_func_time[i + cpu]; + if (is_in_range((unsigned long)&stat_one_func_time[m + cpu], __func__, __LINE__)) + stat_one_func_time[m + cpu] += rdtsc() - stat_one_func_time[n + cpu]; + } + else if (!this) { + for_each_possible_cpu(other_cpu) { + if (stat_one_func_time[i + other_cpu] > 0) { + recored = true; + if (is_in_range((unsigned long)&stat_one_func_time[j + cpu], __func__, __LINE__)) + stat_one_func_time[j + cpu] += rdtsc() - stat_one_func_time[i + other_cpu]; + if (is_in_range((unsigned long)&stat_one_func_time[m + cpu], __func__, __LINE__)) + stat_one_func_time[m + cpu] += 
rdtsc() - stat_one_func_time[n + other_cpu]; + break; + } + } + } + if (recored) { + i = stat_one_func_num_array[index][k]; + if (is_in_range((unsigned long)&stat_one_func_time[i + cpu], __func__, __LINE__)) + stat_one_func_time[i + cpu]++; + } + tk2_curr_ptr->numa_migrate_retry ^= 1 << index; +out: + return; +} +/* + * process block time + */ +void save_sched_in(void) +{ + int i, bit, k; + int cpu = smp_processor_id(); + unsigned long time; + + if (register_ftrace_ftrace != FTRACE_REGISTER) + return; + + if (context_check()) + goto out; + if (hook_count <= 0 || !stat_one_func_time) + goto out; + + for_each_set_bit(bit, &tk2_curr_ptr->numa_migrate_retry, hook_count) { + k = find_entry_index(cpu, bit, true); + if (k < 0) + goto out; + + i = stat_one_func_start_array[bit][k]; + time = rdtsc(); + if (is_in_range((unsigned long)&stat_one_func_time[i + cpu], __func__, __LINE__)) + stat_one_func_time[i + cpu] = time; + } +out: + context_exit(); + return; +} +void save_sched_out(void) +{ + int i, j, bit, k; + int cpu = smp_processor_id(), other_cpu; + unsigned long time; + + if (register_ftrace_ftrace != FTRACE_REGISTER) + return; + + if (context_check()) + goto out; + if (hook_count <= 0 || !stat_one_func_time) + goto out; + + time = rdtsc(); + for_each_set_bit(bit, &tk2_curr_ptr->numa_migrate_retry, hook_count) { + k = find_entry_index(cpu, bit, false); + if (k < 0) + goto out; + i = stat_one_func_start_array[bit][k]; + j = stat_one_func_total_array[bit][k]; + if (time > stat_one_func_time[i + cpu] && stat_one_func_time[i + cpu] != 0 + && is_in_range((unsigned long)&stat_one_func_time[j + cpu], __func__, __LINE__)) + stat_one_func_time[j + cpu] += time - stat_one_func_time[i + cpu]; + else if (sysctl_trace_type == 1) { + for_each_possible_cpu(other_cpu) { + if (stat_one_func_time[i + other_cpu] > 0 + && is_in_range((unsigned long)&stat_one_func_time[j + cpu], __func__, __LINE__)) { + stat_one_func_time[j + cpu] += rdtsc() - stat_one_func_time[i + other_cpu]; + break; + } + } + } + if (is_in_range((unsigned long)&stat_one_func_time[i + cpu], __func__, __LINE__)) + stat_one_func_time[i + cpu] = 0; + } +out: + context_exit(); + return; +} +/* + * compute total time + */ +void stat_total_time(void) +{ + int i, j, k, n, m, cpu; + + if (hook_count <= 0) + return; + + for(i = 1; i <= hook_count; i++) { + stat_func_total_time[i] = 0; + stat_func_total_num[i] = 0; + stat_func_block_time[i] = 0; + for(n = 0; n < ENTRY_TIMES_PERCPU; n++) { + j = stat_one_func_total_array[i][n]; + k = stat_one_func_num_array[i][n]; + m = stat_one_func_block_array[i][n]; + for_each_possible_cpu(cpu) { + stat_func_total_time[i] += stat_one_func_time[j + cpu]; + stat_func_total_num[i] += stat_one_func_time[k + cpu]; + stat_func_block_time[i] += stat_one_func_time[m + cpu]; + } + } + if (stat_func_total_num[i]) { + stat_func_total_time[i] /= stat_func_total_num[i]; + stat_func_block_time[i] /= stat_func_total_num[i]; + } + } +} +/* + * enter scheduer or interrupt + */ +void stat_time_start(void) +{ + unsigned long type, time; + unsigned long nr = get_sys_nr(); + int cpu = smp_processor_id(); + + if (!module_is_enable()) + goto out; + + if (sysctl_module_block_enable) + goto out; + + if (strstr(current->comm, "swapper")) + goto not_stat; + + type = get_func_type(); + time = rdtsc(); + if (!tk2_curr_ptr) + current->node_stamp = (u64)kzalloc(sizeof(os_aware_t), GFP_ATOMIC); + if (!tk2_curr_ptr) + goto out; + + if ((type & STAT_SYSCALL_TYPE) != 0 && nr < NR_syscalls && tk2_curr_ptr->numa_faults_locality[0] > 0) + 
tk2_curr_ptr->numa_faults_locality[1] += time - tk2_curr_ptr->numa_faults_locality[0]; + if ((type & STAT_FUNC_TYPE) != 0 && nr < NR_syscalls && tk2_curr_ptr->numa_faults_locality[1] > 0) { + tk2_curr_ptr->numa_faults_locality[0] += time - tk2_curr_ptr->numa_faults_locality[1]; + save_sched_out(); + } + if ((type & STAT_PAGEFAULT_TYPE) != 0) + tk2_curr_ptr->numa_faults_locality[1] += time - tk2_curr_ptr->numa_faults_locality[0]; + + +not_stat: + if ((type & STAT_FUNC_TYPE) == 0) + tk2_curr_ptr->numa_faults_locality[0] = 0; + else + tk2_curr_ptr->numa_faults_locality[1] = 0; +out: + return; +} +/* + * exit scheduer or interrupt + */ +void stat_time_finish(void) +{ + unsigned long type; + + if (sysctl_module_block_enable) + goto out; + + if (!tk2_curr_ptr) + current->node_stamp = (u64)kzalloc(sizeof(os_aware_t), GFP_ATOMIC); + if (!tk2_curr_ptr) + goto out; + + type = get_func_type(); + if ((type & STAT_FUNC_TYPE) == 0) { + tk2_curr_ptr->numa_faults_locality[0] = rdtsc(); + save_sched_in(); + } + else + tk2_curr_ptr->numa_faults_locality[1] = rdtsc(); +out: + + return; +} + +/* + * stat syscall + */ +void set_sys_nr(unsigned long nr) +{ + tk2_curr_ptr->numa_faults_locality[2] = nr << 3; +} +unsigned long get_sys_nr(void) +{ + return tk2_curr_ptr->numa_faults_locality[2] >> 3; +} +void set_func_type(int pos) +{ + tk2_curr_ptr->numa_faults_locality[2] |= pos; +} +unsigned long get_func_type(void) +{ + return tk2_curr_ptr->numa_faults_locality[2] & 0x07; +} +void clr_func_type(int pos) +{ + tk2_curr_ptr->numa_faults_locality[2] ^= pos; +} + +void stat_stat_syscall_enter(struct kret_data *data) +{ + if (!tk2_curr_ptr) + current->node_stamp = (u64)kzalloc(sizeof(os_aware_t), GFP_ATOMIC); + if (!tk2_curr_ptr) + goto out; + + set_func_type(STAT_SYSCALL_TYPE); + tk2_curr_ptr->numa_faults_locality[0] = rdtsc(); + tk2_curr_ptr->numa_faults_locality[1] = 0; + tk2_curr_ptr->numa_pages_migrated = tk2_curr_ptr->numa_faults_locality[0]; +out: + return; +} + +void stat_stat_syscall_exit(unsigned long nr, struct kret_data *data) +{ + int cpu = smp_processor_id(); + unsigned long nr_tmp = nr, time2; + + time2 = rdtsc(); + if (module_is_enable() && nr_tmp < NR_syscalls) { + if (!tk2_curr_ptr) + current->node_stamp = (u64)kzalloc(sizeof(os_aware_t), GFP_ATOMIC); + if (!tk2_curr_ptr) + goto out; + nr_tmp = array_index_nospec(nr, NR_syscalls); + if (nr_tmp < NR_syscalls + && tk2_curr_ptr->numa_faults_locality[0] >= tk2_curr_ptr->numa_pages_migrated + && tk2_curr_ptr->numa_faults_locality[1] < time2 - tk2_curr_ptr->numa_pages_migrated) { + if (!is_in_range_v((unsigned long)(&stat_sys_num[cpu][nr_tmp]), __func__, __LINE__) + || !is_in_range_v((unsigned long)(&stat_sys_time[cpu][nr_tmp]), __func__, __LINE__) + || !is_in_range_v((unsigned long)(&stat_sys_time_block[cpu][nr_tmp]), __func__, __LINE__)) + goto out; + + stat_sys_num[cpu][nr_tmp]++; + stat_sys_time[cpu][nr_tmp] += time2 - tk2_curr_ptr->numa_faults_locality[0] + tk2_curr_ptr->numa_faults_locality[1]; + stat_sys_time_block[cpu][nr_tmp] += time2 - tk2_curr_ptr->numa_pages_migrated; + } + tk2_curr_ptr->numa_faults_locality[0] = 0; + tk2_curr_ptr->numa_faults_locality[2] = 0; + } +out: + return; +} +/* + * stat normal function + * */ +void stat_func_enter(struct kret_data *data) +{ + if (!tk2_curr_ptr) + current->node_stamp = (u64)kzalloc(sizeof(os_aware_t), GFP_ATOMIC); + if (!tk2_curr_ptr) + return; + + if (tk2_curr_ptr->numa_pages_migrated == 0) { + tk2_curr_ptr->total_numa_faults = 0; + clr_func_type(STAT_FUNC_TYPE); + } + 
tk2_curr_ptr->total_numa_faults++; + if (tk2_curr_ptr->total_numa_faults > 1) + return; + + set_func_type(STAT_FUNC_TYPE); + tk2_curr_ptr->numa_pages_migrated = rdtsc(); + tk2_curr_ptr->numa_faults_locality[1] = tk2_curr_ptr->numa_pages_migrated; + tk2_curr_ptr->numa_faults_locality[0] = 0; + + return; +} +void stat_func_exit(struct kret_data *data) +{ + int cpu = smp_processor_id(); + unsigned long time2; + + if (!tk2_curr_ptr) + current->node_stamp = (u64)kzalloc(sizeof(os_aware_t), GFP_ATOMIC); + if (!tk2_curr_ptr) + return; + + tk2_curr_ptr->total_numa_faults--; + if (tk2_curr_ptr->total_numa_faults > 0) + return; + + time2 = rdtsc(); + if (module_is_enable() && stat_sys_num) { + if (tk2_curr_ptr->numa_faults_locality[1] > 0 && time2 >= tk2_curr_ptr->numa_faults_locality[1] + && tk2_curr_ptr->numa_faults_locality[1] >= tk2_curr_ptr->numa_pages_migrated + && tk2_curr_ptr->numa_faults_locality[0] < (time2 - tk2_curr_ptr->numa_pages_migrated)) { + stat_func_num[cpu]++; + stat_func_time[cpu] += time2 - tk2_curr_ptr->numa_faults_locality[1] + tk2_curr_ptr->numa_faults_locality[0]; + stat_func_time_block[cpu] += time2 - tk2_curr_ptr->numa_pages_migrated; + } + clr_func_type(STAT_FUNC_TYPE); + tk2_curr_ptr->numa_faults_locality[1] = 0; + tk2_curr_ptr->numa_faults_locality[0] = 0; + } + + return; +} + +/* + * process irq time + */ +#ifdef CONFIG_ARM64 +void stat_gic_handle_irq(struct pt_regs *regs) +{ + enter_hook(); + + stat_time_start(); + context_check_end(); + + gic_handle_irq(regs); + + context_check_start(); + stat_time_finish(); + + exit_hook(); +} +#endif +#ifdef CONFIG_X86_64 +#ifdef TK5 +#define IRQ_HANDLER(name) \ +void stat_##name(struct irq_desc *desc) \ +{ \ + enter_hook(); \ +\ + stat_time_start(); \ +\ + context_check_end(); \ + __##name(desc); \ +\ + context_check_start(); \ + stat_time_finish(); \ +\ + exit_hook(); \ +\ + return; \ +\ +} +IRQ_HANDLER(handle_level_irq); +IRQ_HANDLER(handle_fasteoi_irq); +IRQ_HANDLER(handle_edge_irq); +IRQ_HANDLER(handle_simple_irq); +#else +unsigned int stat_do_IRQ(struct pt_regs *regs) +{ + int ret; + enter_hook(); + + stat_time_start(); + context_check_end(); + + ret = do_IRQ(regs); + + context_check_start(); + stat_time_finish(); + + exit_hook(); + + return ret; +} +#endif +#endif + +/* + * process scheduler time + */ +/* __schedule/prepare_task_switch */ +/* support > 5.4.203: has sched_rqm_switch*/ +void stat_sched_rqm_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + enter_hook(); + + stat_time_start(); + context_check_end(); + + sched_rqm_switch(rq, prev, next); + + context_check_start(); + exit_hook(); + return; +} + +/* support 5.4.119: has no sched_rqm_switch*/ +void stat_rcu_note_context_switch(bool preempt) +{ + enter_hook(); + + stat_time_start(); + context_check_end(); + + rcu_note_context_switch(preempt); + + context_check_start(); + exit_hook(); +} +/*psi_task_switch for tk5: prev leave, stat prev time; next come, recored start time +* as, tk5 cannot hook finish task switch. 
+*/ +#ifdef TK5 +void stat_psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep) +{ + int cpu = smp_processor_id(); + unsigned long nr = get_sys_nr(); + unsigned long type, time; + + enter_hook(); + + if (!module_is_enable()) + goto out; + + if (sysctl_module_block_enable) + goto out; + + if (strstr(prev->comm, "swapper")) + goto not_stat; + type = get_func_type(); + time = rdtsc(); + + if (!tk2_prev_ptr || !tk2_next_ptr) + goto out; + + if (tk2_prev_ptr->numa_faults_locality[1] == 0 && tk2_prev_ptr->numa_faults_locality[0] > 0) { + tk2_prev_ptr->numa_faults_locality[0] = 0; + tk2_prev_ptr->numa_faults_locality[2] = 0; + tk2_prev_ptr->total_numa_faults = 0; + } + if ((type & STAT_SYSCALL_TYPE) != 0 && nr < NR_syscalls + && tk2_prev_ptr->numa_faults_locality[0] >= tk2_prev_ptr->numa_pages_migrated + && tk2_prev_ptr->numa_faults_locality[0] > 0) { + tk2_prev_ptr->numa_faults_locality[1] += time - tk2_prev_ptr->numa_faults_locality[0]; + } + if ((type & STAT_FUNC_TYPE) != 0 && nr < NR_syscalls + && tk2_prev_ptr->numa_faults_locality[1] >= tk2_prev_ptr->numa_pages_migrated + && tk2_prev_ptr->numa_faults_locality[1] > 0) { + tk2_prev_ptr->numa_faults_locality[0] += time - tk2_prev_ptr->numa_faults_locality[1]; + save_sched_out(); + } + if ((type & STAT_PAGEFAULT_TYPE) != 0) { + time = rdtsc(); + tk2_prev_ptr->numa_faults_locality[1] += time - tk2_prev_ptr->numa_faults_locality[0]; + } + +not_stat: + if (type != 0 && ((type & STAT_FUNC_TYPE) == 0)) { + tk2_prev_ptr->numa_faults_locality[0] = 0; + tk2_next_ptr->numa_faults_locality[0] = rdtsc(); + } + else if ((type & STAT_FUNC_TYPE) != 0){ + tk2_prev_ptr->numa_faults_locality[1] = 0; + tk2_next_ptr->numa_faults_locality[1] = rdtsc(); + } + +out: + context_check_end(); + __psi_task_switch(prev, next, sleep); + + context_check_start(); + exit_hook(); +} +#endif +struct rq *stat_finish_task_switch(struct task_struct *prev) +{ + unsigned long type; + struct rq *rq; + + enter_hook(); + context_check_end(); + + rq = finish_task_switch(prev); + + context_check_start(); + stat_time_finish(); + + exit_hook(); + + return rq; +} +void stat__sched_fork(struct task_struct *p) +{ + unsigned long flags; + + enter_hook(); + + __sched_fork(p); + if (p) { + local_irq_save(flags); + p->node_stamp = (u64)kzalloc(sizeof(os_aware_t), GFP_KERNEL); + local_irq_restore(flags); + } + exit_hook(); +} + +void stat_do_exit(long code) +{ + enter_hook(); + + if (tk2_curr_ptr) + kfree(tk2_curr_ptr); + + exit_hook(); + + do_exit(code); + +} + diff --git a/ops/os_stat/os_stat/catch_signal.c b/ops/os_stat/os_stat/catch_signal.c new file mode 100644 index 0000000000000000000000000000000000000000..bc68a66a09f39a8d27ad6c6c3e443c2bf4cae875 --- /dev/null +++ b/ops/os_stat/os_stat/catch_signal.c @@ -0,0 +1,39 @@ +/* + * Kernel dynamic hooks based on ftrace + * aurelianliu@tencent.com + */ +#include +#include "kprobe_prehook.h" +#include "data_aware.h" + +extern unsigned long (*p__test_func1)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long test_func1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +void catch_kill_signal(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + int index, len; + char *proc; + struct task_struct *target = NULL; + + if (sysctl_trace_type <= 2) + return; + + if 
(((sysctl_trace_type - 2) == arg1) && check_func_name("kill_something_info")) { + index = message_end++ % message_total_index; + if (message_info_stat) { + if (p__test_func1) + target = test_func1(arg3, 0, 0, 0, 0, 0, 0, 0); + + message_info_stat[index].nr = current->pid; + message_info_stat[index].num = arg3; + proc = message_info_stat[index].func; + strncpy(message_info_stat[index].func, current->comm, NAME_MAX >> 1); + if (target) { + proc += (NAME_MAX >> 1); + strncpy(proc, target->comm, NAME_MAX >> 1); + put_task_struct(target); + } + } + } +} diff --git a/ops/os_stat/os_stat/data_aware.c b/ops/os_stat/os_stat/data_aware.c new file mode 100644 index 0000000000000000000000000000000000000000..490a41985b2fef3cb72f7fff308a79bf9d6072bc --- /dev/null +++ b/ops/os_stat/os_stat/data_aware.c @@ -0,0 +1,1367 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include "version.h" +#ifndef TK2 +#include +#else +#include +#endif +#ifndef TK4_NEW_NEW +#include "slab.h" +#endif +#include +#include +#ifndef CONFIG_ARM64 +#include +#endif +#include "data_aware.h" +#include "hook.h" +#include "syms.h" +#include "scene_layer.h" + +#ifdef CONFIG_ARM64 +#define cpu_smt_mask(cpu) cpumask_of(cpu) +#endif + +#ifndef TK2 +extern unsigned int nr_cpu_ids; +#else +extern int nr_cpu_ids; +#endif +/* Global instance */ +unsigned int sysctl_module_disable = 0; // for unloading the module only +unsigned int sysctl_module_enable = 0; // enable module by sysctl +unsigned int sysctl_module_debug = 0; // clear data or keep stats +unsigned int sysctl_trace_type = 0; // trace type: get delta time between two functions +unsigned int sysctl_module_monitor_sampling_rate = 1; // base sampling rate for monitoring +unsigned int sysctl_module_print = 0; // print struct info for debug +unsigned int sysctl_module_block_enable = 0; // enable block stat for throughput +unsigned int message_start = 0; // loop to save info, start index +unsigned int message_end = 0; // loop to save info, end index +unsigned int message_total_index = 1; // total number of message slots +struct func_latency *message_info_stat; +struct func_latency sysctl_data[TRACK_SYSCALL_NUM]; +static unsigned long irq_stat_data[4]; +static unsigned long pagefault_stat_data; +static unsigned long access_vm_stat_time; // fuzzy stat; accesses (via /proc cmdline etc.) are infrequent +static unsigned long access_vm_stat_num; // fuzzy stat; accesses (via /proc cmdline etc.) are infrequent 
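+/* + * Note on the comm filter declared below: when sysctl_module_process_comm is set to + * anything other than "0", module_is_enable() only accounts tasks whose comm appears + * in that string, so all statistics in this file can be narrowed to one process name. + */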
+char sysctl_module_process_comm[NAME_MAX] = "0"; // trace process assigned by pid + +unsigned long **stat_sys_num = NULL; +unsigned long **stat_sys_time = NULL; +unsigned long **stat_sys_time_block = NULL; +unsigned long *stat_func_num = NULL; +unsigned long *stat_func_time = NULL; +unsigned long *stat_func_time_block = NULL; +static unsigned long *stat_cpu_idle; +static unsigned long *stat_cpu_num; +static unsigned long *stat_node_num; +static unsigned long *stat_pagefault_num; +static unsigned long *stat_pagefault_time; +static unsigned long *stat_pagefault_time_block; +static unsigned long *stat_allocpage_num; +static unsigned long *stat_allocpage_time_block; +static unsigned long *stat_slub_alloc_num; +static unsigned long *stat_slub_alloc_size; +static unsigned long *stat_slub_alloc_time_block; +static unsigned long *stat_bio_num; +static unsigned long *stat_bio_size; +static unsigned long *stat_submit_bio_time; +static unsigned long *stat_submit_bio_time_block; +static unsigned long *stat_dispatch_time_block; +static unsigned long *stat_end_bio_time; +static unsigned long *stat_end_bio_time_block; +static unsigned long *stat_bio_time_block; +static unsigned long *stat_bio_disk_num; +static unsigned long *stat_bio_disk_blocknum; +static int *nr_running_per_node; +static int *idle_cpu_stat; +static int *idle_cpu_stat_char; +static unsigned long *stat_sys_num_tmp; +static long *stat_sys_time_tmp; +static long *stat_sys_time_tmp_block; +static unsigned long *stat_mem_node = NULL; +static struct func_latency *data_ptr; +struct func_latency *func_data; +struct address_range data_range_v, data_range_k; + +bool module_is_enable(void) +{ + + if (!sysctl_module_disable && sysctl_module_enable && sysctl_module_debug != FTRACE_CONTROL_CLEAR) { + if (strcmp(sysctl_module_process_comm, "0") && !strstr(sysctl_module_process_comm, current->comm)) + return false; + return true; + } + + return false; +} + +void unload_disable_module(void) +{ + sysctl_module_disable = 1; +} + +inline bool is_in_range_v(unsigned long addr, char *func, int line) +{ + bool ret; + + ret = (addr >= data_range_v.start && addr < data_range_v.end); + if (!ret) { + sysctl_module_enable = 0; + unload_disable_module(); + pr_err("is_in_range_v, Error address in func:%s, line:%d, addr:%lx, range:[%lx, %lx]", func, line, + addr, data_range_v.start, data_range_v.end); + } + + return ret; +} +inline bool is_in_range_k(unsigned long addr, char *func, int line) +{ + bool ret; + + ret = (addr >= data_range_k.start && addr < data_range_k.end); + if (!ret) { + sysctl_module_enable = 0; + unload_disable_module(); + pr_err("is_in_range_k, Error address in func:%s, line:%d, addr:%lx, range:[%lx, %lx]", func, line, + addr, data_range_k.start, data_range_k.end); + } + + return ret; +} +/* + * stat idle or not about other logic cpu, which belone to the same core with current cpu + */ +int test_idle_cpu(struct rq *rq, struct task_struct *p) +{ + if (p != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP +#ifdef TK5 + if (rq->ttwu_pending) +#else + if (!llist_empty(&rq->wake_list)) +#endif + return 0; +#endif + + return 1; +} +/* + * stat scheduler + */ +#ifdef TK2 +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) +{ + return topology_thread_cpumask(cpu); +} +#endif +#endif +struct task_struct * stat_pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags_stat *rf) +{ + int cpu = smp_processor_id(), target, idle = 0; + struct task_struct *p; + + enter_hook_system(); + 
context_check_end(); + + p = pick_next_task(rq, prev, rf); + + context_check_start(); + if (module_is_enable() && stat_cpu_idle && stat_node_num) { + for_each_cpu(target, cpu_smt_mask(cpu)){ + if (target == cpu) + continue; + idle += test_idle_cpu(rq, p); + } + if (!is_in_range_v((unsigned long)&stat_cpu_idle[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_cpu_num[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_node_num[cpu], __func__, __LINE__)) + goto out; + stat_cpu_idle[cpu] += idle; + stat_cpu_num[cpu]++; + stat_node_num[cpu_to_node(cpu) % 2]++; + } +out: + exit_hook_system(); + + return p; +} + +/* + * stat which inode mem is alloced from + */ +struct page *stat_alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask) +{ + unsigned long time = 0; + struct page *page; + + enter_hook_system(); + time = rdtsc(); + context_check_end(); + + page = __alloc_pages_nodemask(gfp_mask, order, preferred_nid, nodemask); + + context_check_start(); + if (module_is_enable()) { + int nid; + int cpu = smp_processor_id(); + if (!is_in_range_v((unsigned long)&stat_allocpage_time_block[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_allocpage_num[cpu], __func__, __LINE__)) + goto out; + stat_allocpage_time_block[cpu] += rdtsc() - time; + stat_allocpage_num[cpu]++; + if (!page) + goto out; + nid = page_to_nid(page); + if (nid >= 0 && nid < nr_node_ids && stat_mem_node) { + if (!is_in_range_v((unsigned long)&stat_mem_node[nid], __func__, __LINE__)) + goto out; + stat_mem_node[nid] += (1 << order); + } + } +out: + exit_hook_system(); + + return page; + +} +void *stat__kmalloc(size_t size, gfp_t flags) +{ + void *ret; + unsigned long time = 0; + + enter_hook_system(); + + time = rdtsc(); + context_check_end(); + + ret = test__kmalloc(size, flags); + + context_check_start(); + if (module_is_enable()) { + int cpu = smp_processor_id(); + if (!is_in_range_v((unsigned long)&stat_slub_alloc_size[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_slub_alloc_num[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_slub_alloc_time_block[cpu], __func__, __LINE__)) + goto out; + stat_slub_alloc_size[cpu] += size; + stat_slub_alloc_num[cpu]++; + stat_slub_alloc_time_block[cpu] += rdtsc() - time; + } +out: + exit_hook_system(); + return ret; +} +void *stat__kmalloc_node(size_t size, gfp_t flags, int node) +{ + void *ret; + unsigned long time = 0; + + enter_hook_system(); + + time = rdtsc(); + context_check_end(); + + ret = test__kmalloc_node(size, flags, node); + + context_check_start(); + if (module_is_enable()) { + int cpu = smp_processor_id(); + if (!is_in_range_v((unsigned long)&stat_slub_alloc_size[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_slub_alloc_num[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_slub_alloc_time_block[cpu], __func__, __LINE__)) + goto out; + stat_slub_alloc_size[cpu] += size; + stat_slub_alloc_num[cpu]++; + stat_slub_alloc_time_block[cpu] += rdtsc() - time; + } +out: + exit_hook_system(); + return ret; +} +void *stat_kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + void *ret; + unsigned long time = 0; + + enter_hook_system(); + + time = rdtsc(); + context_check_end(); + + ret = test_kmem_cache_alloc(s, gfpflags); + + context_check_start(); + if (module_is_enable() && s) { + int cpu = smp_processor_id(); + if (!is_in_range_v((unsigned long)&stat_slub_alloc_size[cpu], __func__, __LINE__) + || 
!is_in_range_v((unsigned long)&stat_slub_alloc_num[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_slub_alloc_time_block[cpu], __func__, __LINE__)) + goto out; + stat_slub_alloc_size[cpu] += s->object_size; + stat_slub_alloc_num[cpu]++; + stat_slub_alloc_time_block[cpu] += rdtsc() - time; + } + +out: + exit_hook_system(); + return ret; +} + +blk_qc_t stat_submit_bio(struct bio *bio) +{ +#ifndef TK2 + blk_qc_t ret; + unsigned long time; + + enter_hook_system(); + + if (module_is_enable() && bio) { + current->numa_faults_locality[1] = 0; + time = rdtsc(); + current->numa_faults_locality[0] = time; + + } + context_check_end(); + + ret = test_submit_bio(bio); + + context_check_start(); + if (module_is_enable() && bio) { + int cpu = smp_processor_id(); + struct bio *bio_tmp; +#ifndef TK3 + if (!is_in_range_v((unsigned long)&stat_bio_size[cpu], __func__, __LINE__)) + goto out; + bio->kabi_reserved1 = 0x55aa; + bio->kabi_reserved2 = rdtsc(); + for (bio_tmp = bio; bio_tmp; bio_tmp = bio_tmp->bi_next) { + stat_bio_size[cpu] += bio_tmp->bi_iter.bi_size; + bio_tmp->kabi_reserved1 = 0x55aa; + bio_tmp->kabi_reserved2 = bio->kabi_reserved2; + } + if (current->numa_faults_locality[0] >= time) { + if (!is_in_range_v((unsigned long)&stat_submit_bio_time[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_submit_bio_time_block[cpu], __func__, __LINE__)) + goto out; + stat_submit_bio_time[cpu] += bio->kabi_reserved2 - current->numa_faults_locality[0] + current->numa_faults_locality[1]; + stat_submit_bio_time_block[cpu] += bio->kabi_reserved2 - time; + } +#endif + } +out: + exit_hook_system(); + + return ret; +#endif +} +//for tk5 +#ifdef TK5 +bool stat_blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, + unsigned int nr_budgets) +{ + bool ret; + unsigned long time; + struct request *rq; + struct bio *bio, *bio_tmp; + + if (module_is_enable()) { + time = rdtsc(); + list_for_each_entry(rq, list, queuelist) { + int cpu = smp_processor_id(); + if (!rq->bio) + continue; + + bio = rq->bio; + if (bio->kabi_reserved2 > 0 && time > bio->kabi_reserved2 && (bio->kabi_reserved1 == 0x55aa)) + stat_dispatch_time_block[cpu] += time - bio->kabi_reserved2; + + for (bio_tmp = rq->bio; bio_tmp; bio_tmp = bio_tmp->bi_next) + bio_tmp->kabi_reserved2 = rdtsc(); + } + } + ret = test_blk_mq_dispatch_rq_list(hctx, list, nr_budgets); + + return ret; +} +//for tk4 +#else +bool stat_blk_mq_get_driver_tag(struct request *rq) +{ + bool ret; + unsigned long time; + struct bio *bio, *bio_tmp; + + enter_hook_system(); + +#if !defined(TK3) && !defined(TK2) + if (module_is_enable() && rq && rq->bio) { + int cpu = smp_processor_id(); + time = rdtsc(); + + bio = rq->bio; + if (bio->kabi_reserved2 > 0 && time > bio->kabi_reserved2 && (bio->kabi_reserved1 == 0x55aa)) + stat_dispatch_time_block[cpu] += time - bio->kabi_reserved2; + + for (bio_tmp = rq->bio; bio_tmp; bio_tmp = bio_tmp->bi_next) + bio_tmp->kabi_reserved2 = rdtsc(); + } +#endif + context_check_end(); + + ret = test_blk_mq_get_driver_tag(rq); + + context_check_start(); + exit_hook_system(); + + return ret; +} +#endif + +int find_index(int start, int total, int partno) +{ + int i, index = -1; + + for(i = start; i < start + total; i++) { + if (stat_bio_disk_blocknum[i] == partno) + return i; + if (stat_bio_disk_blocknum[i] == 0 && index == -1) + index = i; + } + + return index; +} +void stat_bio_endio(struct bio *bio) +{ + unsigned long time; + + enter_hook_system(); + + if (module_is_enable() && bio) { + int cpu = 
smp_processor_id(), index; + int major, partno, first_minor; + + time = rdtsc(); +#if !defined(TK3) && !defined(TK2) + if (bio->kabi_reserved2 > 0 && time > bio->kabi_reserved2 && (bio->kabi_reserved1 == 0x55aa)) { + if (!is_in_range_v((unsigned long)&stat_bio_num[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_time_block[cpu], __func__, __LINE__)) + goto next; + stat_bio_num[cpu]++; + stat_bio_time_block[cpu] += time - bio->kabi_reserved2; +#ifdef TK5 + if (!bio->bi_bdev || !bio->bi_bdev->bd_disk) + goto next; + major = bio->bi_bdev->bd_disk->major; + partno = bio->bi_bdev->bd_partno; + first_minor = bio->bi_bdev->bd_disk->first_minor; +#else + major = bio->bi_disk->major; + partno = bio->bi_partno; + first_minor = bio->bi_disk->first_minor; +#endif + switch (major) { + case DISK_SDA: + index = find_index(0, DISK_SDA_NUM, partno); + if (index < 0) + break; + if (!is_in_range_v((unsigned long)&stat_bio_disk_num[index], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_disk_blocknum[index], __func__, __LINE__)) + goto next; + stat_bio_disk_num[index]++; + stat_bio_disk_blocknum[index] = partno; + break; + case DISK_HDD: + index = find_index(DISK_SDA_NUM, DISK_HDD_SUBNUM, partno); + if (index < 0) + break; + if (!is_in_range_v((unsigned long)&stat_bio_disk_num[index], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_disk_blocknum[index], __func__, __LINE__)) + goto next; + stat_bio_disk_num[index]++; + stat_bio_disk_blocknum[index] = partno; + break; + case DISK_HDD1: + index = find_index(DISK_SDA_NUM + DISK_HDD_SUBNUM, DISK_HDD_SUBNUM, partno); + if (index < 0) + break; + if (!is_in_range_v((unsigned long)&stat_bio_disk_num[index], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_disk_blocknum[index], __func__, __LINE__)) + goto next; + stat_bio_disk_num[index]++; + stat_bio_disk_blocknum[index] = partno; + break; + case DISK_HDD2: + index = find_index(DISK_SDA_NUM + 2 * DISK_HDD_SUBNUM, DISK_HDD_SUBNUM, partno); + if (index < 0) + break; + if (!is_in_range_v((unsigned long)&stat_bio_disk_num[index], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_disk_blocknum[index], __func__, __LINE__)) + goto next; + stat_bio_disk_num[index]++; + stat_bio_disk_blocknum[index] = partno; + break; + case DISK_HDD3: + index = find_index(DISK_SDA_NUM + 3 * DISK_HDD_SUBNUM, DISK_HDD_SUBNUM, partno); + if (index < 0) + break; + if (!is_in_range_v((unsigned long)&stat_bio_disk_num[index], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_disk_blocknum[index], __func__, __LINE__)) + goto next; + stat_bio_disk_num[index]++; + stat_bio_disk_blocknum[index] = partno; + break; + case DISK_DM: + index = find_index(DISK_SDA_NUM + DISK_HDD_NUM, DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM, partno); + if (index < 0) + break; + if (!is_in_range_v((unsigned long)&stat_bio_disk_num[index], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_disk_blocknum[index], __func__, __LINE__)) + goto next; + stat_bio_disk_num[index]++; + stat_bio_disk_blocknum[index] = partno; + break; + case DISK_VD: + index = find_index(DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM, DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM + DISK_VD_NUM, partno); + if (index < 0) + break; + if (!is_in_range_v((unsigned long)&stat_bio_disk_num[index], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_disk_blocknum[index], __func__, __LINE__)) + goto next; + stat_bio_disk_num[index]++; + stat_bio_disk_blocknum[index] = partno; + break; + case 
DISK_NVME: + index = find_index(DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM + DISK_VD_NUM, DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM + DISK_VD_NUM + DISK_NVME_NUM, partno); + if (index < 0) + break; + if (!is_in_range_v((unsigned long)&stat_bio_disk_num[index], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_bio_disk_blocknum[index], __func__, __LINE__)) + goto next; + stat_bio_disk_num[index]++; + stat_bio_disk_blocknum[index] = partno; + break; + default: + break; + } + + bio->kabi_reserved1 = 0; + bio->kabi_reserved2 = 0; + } + time = rdtsc(); + current->numa_faults_locality[1] = 0; + current->numa_faults_locality[0] = time; +#endif + } +next: + context_check_end(); + test_bio_endio(bio); + + context_check_start(); +#if !defined(TK2) + if (module_is_enable()) { + if (!is_in_range_v((unsigned long)&stat_end_bio_time[smp_processor_id()], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_end_bio_time_block[smp_processor_id()], __func__, __LINE__)) + goto out; + stat_end_bio_time[smp_processor_id()] += rdtsc() - current->numa_faults_locality[0] + current->numa_faults_locality[1]; + stat_end_bio_time_block[smp_processor_id()] += rdtsc() - time; + } +#endif +out: + exit_hook_system(); +} +#ifdef TK5 +/* page fault for tk5*/ +vm_fault_t stat_handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs) +{ + vm_fault_t ret; + unsigned long time; + + enter_hook_system(); + + current->numa_faults_locality[1] = 0; + + set_func_type(STAT_PAGEFAULT_TYPE); + time = rdtsc(); + current->numa_faults_locality[0] = time; + context_check_end(); + + ret = handle_mm_fault(vma, address, flags, regs); + + context_check_start(); + if (module_is_enable() && stat_pagefault_num && current->numa_faults_locality[0] >= time) { + int cpu = smp_processor_id(); + if (!is_in_range_v((unsigned long)&stat_pagefault_num[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_pagefault_time[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_pagefault_time_block[cpu], __func__, __LINE__)) + goto out; + stat_pagefault_num[cpu]++; + stat_pagefault_time[cpu] += rdtsc() - current->numa_faults_locality[0] + current->numa_faults_locality[1]; + stat_pagefault_time_block[cpu] += rdtsc() - time; + } +out: + current->numa_faults_locality[2] = 0; + + exit_hook_system(); + + return ret; +} +#endif +/* page fault for tk4*/ +void stat_do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) +{ + unsigned long time; + + enter_hook_system(); + +#if !defined(TK2) + current->numa_faults_locality[1] = 0; + + set_func_type(STAT_PAGEFAULT_TYPE); + time = rdtsc(); + current->numa_faults_locality[0] = time; +#endif + context_check_end(); + + do_page_fault(regs, error_code, address); + + context_check_start(); +#if !defined(TK2) + if (module_is_enable() && stat_pagefault_num && current->numa_faults_locality[0] >= time) { + int cpu = smp_processor_id(); + if (!is_in_range_v((unsigned long)&stat_pagefault_num[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_pagefault_time[cpu], __func__, __LINE__) + || !is_in_range_v((unsigned long)&stat_pagefault_time_block[cpu], __func__, __LINE__)) + goto out; + stat_pagefault_num[cpu]++; + stat_pagefault_time[cpu] += rdtsc() - current->numa_faults_locality[0] + current->numa_faults_locality[1]; + stat_pagefault_time_block[cpu] += rdtsc() - time; + } +out: + current->numa_faults_locality[2] = 0; +#endif + + exit_hook_system(); +} + +int stat_access_remote_vm(struct 
mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + int ret; + unsigned long time = rdtsc(); + + ret = test_access_remote_vm(mm, addr, buf, len, gup_flags); + + if (module_is_enable()) { + access_vm_stat_time += rdtsc() - time; + access_vm_stat_num++; + } + + return ret; +} + +void sys_get_latency(struct func_latency **data) +{ + int i, j, k, cpu, busy = 0; + unsigned long syscall_max[PRINT_SYSCALL_NUM][2] = {0}; + unsigned long func_num = 0, func_time = 0, func_block_time = 0, alloc_page_block_time = 0; + unsigned long bio_block_time = 0, bio_num = 0, allloc_page_num = 0, bio_size = 0; + unsigned long slub_block_time = 0, slub_num = 0, slub_size = 0; + unsigned long submit_bio_time = 0, end_bio_time = 0, submit_bio_time_block = 0, end_bio_time_block = 0; + unsigned long dispatch_bio_time_block = 0; + unsigned long irq_time = 0, softirq_time = 0, irq_num = 0, softirq_num = 0, pagefault_hit = 0; + int nid, idle_cur = 0, idle_smt = 0, target; + + if (!data) + return; + + if (!is_in_range_v((unsigned long)stat_sys_num_tmp, __func__, __LINE__) + || !is_in_range_v((unsigned long)(stat_sys_num_tmp + 3 * NR_syscalls - 1), __func__, __LINE__)) + return; + if (!is_in_range_v((unsigned long)stat_sys_time_tmp, __func__, __LINE__) + || !is_in_range_v((unsigned long)(stat_sys_time_tmp + NR_syscalls - 1), __func__, __LINE__)) + return; + if (!is_in_range_v((unsigned long)stat_sys_time_tmp_block, __func__, __LINE__) + || !is_in_range_v((unsigned long)(stat_sys_time_tmp_block + NR_syscalls - 1), __func__, __LINE__)) + return; + + memset(stat_sys_num_tmp, 0, 3 * NR_syscalls * sizeof(unsigned long)); + for(i = 0; i < NR_syscalls; i++) { + for_each_possible_cpu(cpu) { + if (!is_in_range_v((unsigned long)&stat_sys_num[cpu][i], __func__, __LINE__)) + continue; + stat_sys_num_tmp[i] += stat_sys_num[cpu][i]; + stat_sys_time_tmp[i] += stat_sys_time[cpu][i]; + stat_sys_time_tmp_block[i] += stat_sys_time_block[cpu][i]; + } + for (j = 0; j < PRINT_SYSCALL_NUM; j++) { + if (stat_sys_num_tmp[i] > syscall_max[j][1]) { + for (k = PRINT_SYSCALL_NUM - 1; k > j; k--) { + syscall_max[k][0] = syscall_max[k - 1][0]; + syscall_max[k][1] = syscall_max[k - 1][1]; + } + syscall_max[j][0] = i; + syscall_max[j][1] = stat_sys_num_tmp[i]; + break; + } + } + } + + for_each_possible_cpu(cpu) { + struct kernel_cpustat *kcs = &kcpustat_cpu(cpu); +#ifdef CONFIG_VM_EVENT_COUNTERS + struct vm_event_state *vm_stat = &per_cpu(vm_event_states, cpu); +#endif + func_num += stat_pagefault_num[cpu]; + func_time += stat_pagefault_time[cpu]; + func_block_time += stat_pagefault_time_block[cpu]; + allloc_page_num += stat_allocpage_num[cpu]; + alloc_page_block_time += stat_allocpage_time_block[cpu]; + slub_size += stat_slub_alloc_size[cpu]; + slub_block_time += stat_slub_alloc_time_block[cpu]; + slub_num += stat_slub_alloc_num[cpu]; + bio_num += stat_bio_num[cpu]; + bio_size += stat_bio_size[cpu]; + bio_block_time += stat_bio_time_block[cpu]; + submit_bio_time += stat_submit_bio_time[cpu]; + submit_bio_time_block += stat_submit_bio_time_block[cpu]; + dispatch_bio_time_block += stat_dispatch_time_block[cpu]; + end_bio_time += stat_end_bio_time[cpu]; + end_bio_time_block += stat_end_bio_time_block[cpu]; + irq_time += kcs->cpustat[CPUTIME_IRQ]; + softirq_time += kcs->cpustat[CPUTIME_SOFTIRQ]; +#ifdef CONFIG_X86_64 + irq_num += kstat_cpu_irqs_sum(cpu) + stat_arch_irq_stat_cpu(cpu); +#endif + + for (i = 0; i < NR_SOFTIRQS; i++) + softirq_num += kstat_softirqs_cpu(i, cpu); +#ifdef CONFIG_VM_EVENT_COUNTERS + pagefault_hit += 
vm_stat->event[PGMAJFAULT]; +#endif + } +#ifdef CONFIG_X86_64 + irq_num += stat_arch_irq_stat(); +#endif + + for (i = 0; i < PRINT_SYSCALL_NUM; i++) { + (*data)[i].nr = syscall_max[i][0]; + (*data)[i].num = syscall_max[i][1]; + (*data)[i].latency = 0; + (*data)[i].block_latency = 0; + if (syscall_max[i][1]) { + (*data)[i].latency = stat_sys_time_tmp[(*data)[i].nr] / syscall_max[i][1]; + (*data)[i].block_latency = stat_sys_time_tmp_block[(*data)[i].nr] / syscall_max[i][1]; + if ((*data)[i].latency > (*data)[i].block_latency) + (*data)[i].latency = (*data)[i].block_latency; + } +#ifdef TK5 + sprintf((*data)[i].func, "%pS", stat_sys_call_table[syscall_max[i][0]]); +#else + sprintf((*data)[i].func, "%pF", stat_sys_call_table[syscall_max[i][0]]); +#endif + } + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 0 */ + (*data)[i].nr = 0; + (*data)[i].num = access_vm_stat_num; + if (access_vm_stat_num) { + (*data)[i].latency = 0; + (*data)[i].block_latency = access_vm_stat_time / access_vm_stat_num; + } + sprintf((*data)[i++].func, "access vm stat, for /proc/$pid/cmdline,/proc/$pid/mem, /proc/$pid/environ etc."); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 1 */ + (*data)[i].nr = 0; + (*data)[i].num = irq_num - irq_stat_data[2]; + if (irq_num) + (*data)[i].latency = irq_time - irq_stat_data[0]; + sprintf((*data)[i++].func, "irq stat"); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 2 */ + (*data)[i].nr = 0; + (*data)[i].num = softirq_num - irq_stat_data[3]; + if (softirq_num) { + (*data)[i].latency = softirq_time - irq_stat_data[1]; + } + sprintf((*data)[i++].func, "soft irq stat"); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 3 */ + (*data)[i].nr = pagefault_hit - pagefault_stat_data; + (*data)[i].num = func_num; + if (func_num) { + (*data)[i].latency = func_time / func_num; + (*data)[i].block_latency = func_block_time / func_num; + } + strcpy((*data)[i++].func, "do_page_fault"); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 4 */ + (*data)[i].nr = slub_size >> 10; + (*data)[i].num = slub_num; + if (slub_num) { + (*data)[i].latency = 0; + (*data)[i].block_latency = slub_block_time / slub_num; + } + strcpy((*data)[i++].func, "slub:__kmalloc/__kmalloc_node/kmemcachealloc"); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 5 , + 6 */ + j = 0; + for_each_node(nid) { + (*data)[i].nr = nid; + (*data)[i].num = (stat_mem_node[nid] * PAGE_SIZE) >> 10; + (*data)[i].latency = allloc_page_num; + if (allloc_page_num) + (*data)[i].block_latency = alloc_page_block_time / allloc_page_num; + sprintf((*data)[i].func, "alloc pages, total nodes:%d", nr_node_ids); + i++; + if (j++ >= 1) + break; + } + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 6 */ + if (j == 1) + i++; + + memset(nr_running_per_node, 0, nr_node_ids * sizeof(int)); + memset(idle_cpu_stat, 0, nr_cpu_ids * sizeof(int)); + for_each_possible_cpu(cpu) { + nid = cpu_to_node(cpu); + idle_smt = 0; + idle_cur = 0; + for_each_cpu(target, cpu_smt_mask(cpu)){ + if (idle_cpu_stat[target] == 0) +#ifdef TK5 + nr_running_per_node[nid] += per_cpu(*runqueues, target).nr_running; +#else + nr_running_per_node[nid] += cpu_rq(target)->nr_running; +#endif + if (target == cpu) + idle_cur = idle_cpu(target); + else + idle_smt += idle_cpu(target); + } + if (!idle_cur && !idle_smt) { + idle_cpu_stat[cpu] = 1; + busy++; + } + if (idle_cur || idle_smt) + idle_cpu_stat[cpu] = 2; + if (idle_cur && idle_smt) + idle_cpu_stat[cpu] = 3; + } + j = 0; + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 7, + 8 */ + for_each_node(nid) { + (*data)[i].nr = nid; + (*data)[i].num = nr_running_per_node[nid]; + (*data)[i].latency = busy; + 
(*data)[i].block_latency = 0; + strcpy((*data)[i].func, "rq running per node"); + i++; + if (++j > 1) + break; + } + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 8 */ + if (j == 1) + i++; + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 9 */ + (*data)[i].nr = 0x00; + (*data)[i].num = bio_num; + if (bio_num) { + (*data)[i].latency = submit_bio_time / bio_num; + (*data)[i].block_latency = submit_bio_time_block / bio_num; + } + strcpy((*data)[i++].func, "submit_bio"); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 10 */ + (*data)[i].nr = 0x00; + (*data)[i].num = bio_num; + if (bio_num) { + (*data)[i].latency = end_bio_time / bio_num; + (*data)[i].block_latency = end_bio_time_block / bio_num; + } + strcpy((*data)[i++].func, "bio_endio"); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 11 */ + (*data)[i].nr = 0x00; + (*data)[i].num = bio_num; + if (bio_num) { + (*data)[i].latency = 0; + (*data)[i].block_latency = dispatch_bio_time_block / bio_num; + } + sprintf((*data)[i++].func, "submit_bio after -> pre blk_mq_dispatch_rq_list, bytes:%ld", bio_size); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 12 */ + (*data)[i].nr = 0x00; + (*data)[i].num = bio_num; + if (bio_num) { + (*data)[i].latency = 0; + (*data)[i].block_latency = bio_block_time / bio_num; + } + sprintf((*data)[i++].func, "pre blk_mq_dispatch_rq_list -> pre bio_endio, bytes:%ld", bio_size); + + + /* DISK_TOTAL_NUM :0 ~ DISK_TOTAL_NUM */ + for(j = 0; j < DISK_TOTAL_NUM; j++) { + if (j < DISK_SDA_NUM && stat_bio_disk_num[j] > 0) { + (*data)[j+i].nr = 0x00; + (*data)[j+i].num = stat_bio_disk_num[j]; + sprintf((*data)[j+i].func, "sdx stat:%d:%ld", DISK_SDA, stat_bio_disk_blocknum[j]); + } + else if (j < DISK_SDA_NUM + DISK_HDD_SUBNUM && stat_bio_disk_num[j] > 0) { + (*data)[j+i].nr = 0x00; + (*data)[j+i].num = stat_bio_disk_num[j]; + sprintf((*data)[j+i].func, "hdd stat:%d:%ld", DISK_HDD, stat_bio_disk_blocknum[j]); + } + else if (j < DISK_SDA_NUM + 2 * DISK_HDD_SUBNUM && stat_bio_disk_num[j] > 0) { + (*data)[j+i].nr = 0x00; + (*data)[j+i].num = stat_bio_disk_num[j]; + sprintf((*data)[j+i].func, "hdd stat:%d:%ld", DISK_HDD1, stat_bio_disk_blocknum[j]); + } + else if (j < DISK_SDA_NUM + 3 * DISK_HDD_SUBNUM && stat_bio_disk_num[j] > 0) { + (*data)[j+i].nr = 0x00; + (*data)[j+i].num = stat_bio_disk_num[j]; + sprintf((*data)[j+i].func, "hdd stat:%d:%ld", DISK_HDD2, stat_bio_disk_blocknum[j]); + } + else if (j < DISK_SDA_NUM + 4 * DISK_HDD_SUBNUM && stat_bio_disk_num[j] > 0) { + (*data)[j+i].nr = 0x00; + (*data)[j+i].num = stat_bio_disk_num[j]; + sprintf((*data)[j+i].func, "hdd stat:%d:%ld", DISK_HDD3, stat_bio_disk_blocknum[j]); + } + else if (j < DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM && stat_bio_disk_num[j] > 0) { + (*data)[j+i].nr = 0x00; + (*data)[j+i].num = stat_bio_disk_num[j]; + sprintf((*data)[j+i].func, "dm stat:%d:%ld", DISK_DM, stat_bio_disk_blocknum[j]); + } + else if (j < DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM + DISK_VD_NUM && stat_bio_disk_num[j] > 0) { + (*data)[j+i].nr = 0x00; + (*data)[j+i].num = stat_bio_disk_num[j]; + sprintf((*data)[j+i].func, "vdx stat:%d:%ld", DISK_VD, stat_bio_disk_blocknum[j]); + } + else if (stat_bio_disk_num[j] > 0) { + (*data)[j+i].nr = 0x00; + (*data)[j+i].num = stat_bio_disk_num[j]; + sprintf((*data)[j+i].func, "nvme stat:%d:%ld", DISK_NVME, stat_bio_disk_blocknum[j]); + } + } + + return; +} + +void sys_get_latency_first(struct func_latency (*data)[PRINT_SYSCALL_NUM]) +{ + int i, j, k, cpu; + unsigned long syscall_max[PRINT_SYSCALL_NUM][4] = {0}; + unsigned long func_num = 0, func_time = 0, func_block_time = 0; + + if 
(!data) + return; + + if (!is_in_range_v((unsigned long)stat_sys_num_tmp, __func__, __LINE__) + || !is_in_range_v((unsigned long)(stat_sys_num_tmp + 3 * NR_syscalls - 1), __func__, __LINE__)) + return; + if (!is_in_range_v((unsigned long)stat_sys_time_tmp, __func__, __LINE__) + || !is_in_range_v((unsigned long)(stat_sys_time_tmp + NR_syscalls - 1), __func__, __LINE__)) + return; + if (!is_in_range_v((unsigned long)stat_sys_time_tmp_block, __func__, __LINE__) + || !is_in_range_v((unsigned long)(stat_sys_time_tmp_block + NR_syscalls - 1), __func__, __LINE__)) + return; + + memset(stat_sys_num_tmp, 0, 3 * NR_syscalls * sizeof(unsigned long)); + for(i = 0; i < NR_syscalls; i++) { + if (!is_in_range_v(&stat_sys_num_tmp[i], __func__, __LINE__) || !is_in_range_v(&stat_sys_time_tmp[i], __func__, __LINE__) + || !is_in_range_v(&stat_sys_time_tmp_block[i], __func__, __LINE__)) + continue; + for_each_possible_cpu(cpu) { + stat_sys_num_tmp[i] += stat_sys_num[cpu][i]; + stat_sys_time_tmp[i] += stat_sys_time[cpu][i]; + stat_sys_time_tmp_block[i] += stat_sys_time_block[cpu][i]; + } + for (j = 0; j < PRINT_SYSCALL_NUM; j++) { + if (stat_sys_time_tmp[i] * stat_sys_num_tmp[syscall_max[j][0]]> syscall_max[j][1] * stat_sys_num_tmp[i]) { + for (k = PRINT_SYSCALL_NUM - 1; k > j; k--) { + syscall_max[k][0] = syscall_max[k - 1][0]; + syscall_max[k][1] = syscall_max[k - 1][1]; + syscall_max[k][2] = syscall_max[k - 1][2]; + syscall_max[k][3] = syscall_max[k - 1][3]; + } + syscall_max[j][0] = i; + syscall_max[j][1] = stat_sys_time_tmp[i]; + syscall_max[j][2] = stat_sys_time_tmp_block[i]; + syscall_max[j][3] = stat_sys_num_tmp[i]; + break; + } + } + } + + for_each_possible_cpu(cpu) { + func_num += stat_pagefault_num[cpu]; + func_time += stat_pagefault_time[cpu]; + func_block_time += stat_pagefault_time_block[cpu]; + } + + for (i = 0; i < PRINT_SYSCALL_NUM - 1; i++) { + (*data)[i].nr = syscall_max[i][0]; + (*data)[i].num = syscall_max[i][2]; + (*data)[i].latency = 0; + (*data)[i].block_latency = 0; + if ((*data)[i].num) { + (*data)[i].latency = syscall_max[i][1] / (*data)[i].num; + (*data)[i].block_latency = syscall_max[i][2] / (*data)[i].num; + } + sprintf((*data)[i].func, "%pF", stat_sys_call_table[syscall_max[i][0]]); + } + + (*data)[i].nr = 0x00; + (*data)[i].num = func_num; + if (func_num) { + (*data)[i].latency = func_time / func_num; + (*data)[i].block_latency = func_block_time / func_num; + } + strcpy((*data)[i].func, "do_page_fault"); + return; +} +/**********************sysctl***************************/ +int sysctl_numa_enable_handler(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos) +#else + void __user *buffer, size_t *lenp, loff_t *ppos) +#endif +{ + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + return ret; +} + +int sysctl_get_data_handler(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos) +#else + void __user *buffer, size_t *lenp, loff_t *ppos) +#endif +{ + size_t len = *lenp >TRACK_SYSCALL_NUM * sizeof(struct func_latency)?TRACK_SYSCALL_NUM * sizeof(struct func_latency):*lenp; + + sys_get_latency(&data_ptr); +#ifdef TK5 + memcpy(buffer, data_ptr, len); +#else + copy_to_user(buffer, (void *)data_ptr, len); +#endif + +#if 0 + int i, j; + for (i = 0; i < PRINT_SYSCALL_NUM - 5; i++) { + pr_info("nr:%4ld, num:%8ld, ave:%8ld(ns), total:%8ld, block ave:%8ld(ns), total:%8ld, func:%s\n", + 
data[i].nr, data[i].num, data[i].latency, + data[i].num * data[i].latency, data[i].block_latency, + data[i].num * data[i].block_latency, + data[i].func); + } + pr_info("--------memory-----------\n"); + pr_info("num :%8ld, ave latency:%8ld(ns), block ave latency:%8ld(ns), %s\n", data[i].num, data[i].latency, data[i].block_latency, data[i].func); + i++; + j = 0; + for (;i < PRINT_SYSCALL_NUM - 2; i++) { + pr_info("node:%8ld, mem:%8ld(k), ave block time:%ld, %s\n", data[i].nr, data[i].num, data[i].block_latency, data[i].func); + j++; + if (j >= nr_node_ids) { + i++; + break; + } + } + pr_info("--------rq running num-----------\n"); + j = 0; + for (;i < PRINT_SYSCALL_NUM; i++) { + pr_info("node:%8ld, nr running:%8ld, busy:%ld, %s\n", data[i].nr, data[i].num, data[i].latency, data[i].func); + j++; + if (j >= nr_node_ids) + break; + } +#endif + return 0; +} + +int sysctl_get_func_handler(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos) +#else + void __user *buffer, size_t *lenp, loff_t *ppos) +#endif +{ + int ret = 0, cpu, i, j, count; + unsigned long func_num = 0, func_time = 0, func_block_time = 0; + size_t len = HOOK_FUNC_NUM * sizeof(struct func_latency); + + len = len < *lenp? len:*lenp; + + if (sysctl_trace_type > 2) { + get_info(len, buffer); + return ret; + } + + stat_total_time(); + + for_each_possible_cpu(cpu) { + func_num += stat_func_num[cpu]; + func_time += stat_func_time[cpu]; + func_block_time += stat_func_time_block[cpu]; + } + if (!is_in_range_k((unsigned long)func_data, __func__, __LINE__) + || !is_in_range_k((unsigned long)(func_data + HOOK_FUNC_NUM - 1), __func__, __LINE__)) + return -ENOMEM; + + memset(func_data, 0, HOOK_FUNC_NUM * sizeof(struct func_latency)); + func_data[0].nr = 0x00; + func_data[0].num = func_num; + if (func_num) { + func_data[0].latency = func_time / func_num; + func_data[0].block_latency = func_block_time / func_num; + } + strcpy(func_data[0].func, func_name_new); + j = *lenp / sizeof(struct func_latency); + count = get_one_func_count(); + if (j > HOOK_FUNC_NUM) + j = HOOK_FUNC_NUM; + if (j > count) + j = count; + if (j > 1) { + strcpy(func_data[0].func, get_one_func_name(0)); + for(i = 1; i < j; i++) { + strcpy(func_data[i].func, get_one_func_name(i)); + if (stat_func_total_num[i] == 0) + continue; + func_data[i].num = stat_func_total_num[i]; + func_data[i].latency = stat_func_total_time[i]; + func_data[i].block_latency = stat_func_block_time[i]; + } + } +#ifdef TK5 + memcpy(buffer, &func_data[0], len); +#else + copy_to_user(buffer, func_data, len); +#endif + return ret; +} + +int sysctl_clear_data_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, cpu, i; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (sysctl_module_debug == FTRACE_CONTROL_CLEAR) { + if (!is_in_range_v((unsigned long)stat_sys_num[0], __func__, __LINE__)) + goto out; + memset(stat_sys_num[0], 0, 3 * nr_cpu_ids * NR_syscalls * sizeof(unsigned long)); + if (!is_in_range_v((unsigned long)stat_cpu_idle, __func__, __LINE__)) + goto out; + memset(stat_cpu_idle, 0, 21 * nr_cpu_ids * sizeof(unsigned long)); + memset(irq_stat_data, 0, sizeof(irq_stat_data)); + for_each_possible_cpu(cpu) { + struct kernel_cpustat *kcs = &kcpustat_cpu(cpu); +#ifdef CONFIG_VM_EVENT_COUNTERS + struct vm_event_state *vm_stat = &per_cpu(vm_event_states, cpu); +#endif + irq_stat_data[0] += kcs->cpustat[CPUTIME_IRQ]; + irq_stat_data[1] += kcs->cpustat[CPUTIME_SOFTIRQ]; +#ifdef CONFIG_X86_64 + 
irq_stat_data[2] += kstat_cpu_irqs_sum(cpu) + stat_arch_irq_stat_cpu(cpu); +#endif + for (i = 0; i < NR_SOFTIRQS; i++) + irq_stat_data[3] += kstat_softirqs_cpu(i, cpu); +#ifdef CONFIG_VM_EVENT_COUNTERS + pagefault_stat_data += vm_stat->event[PGMAJFAULT]; +#endif + } +#ifdef CONFIG_X86_64 + irq_stat_data[2] += stat_arch_irq_stat(); +#endif + if (!is_in_range_v((unsigned long)stat_mem_node, __func__, __LINE__) + || !is_in_range_v((unsigned long)(stat_mem_node + 2 * nr_node_ids - 1), __func__, __LINE__)) + goto out; + memset(stat_mem_node, 0, 2 * nr_node_ids * sizeof(int)); + if (!is_in_range_v((unsigned long)stat_bio_disk_num, __func__, __LINE__) + || !is_in_range_v((unsigned long)stat_bio_disk_num + 2 * DISK_TOTAL_NUM - 1, __func__, __LINE__)) + goto out; + memset(stat_bio_disk_num, 0, 2 * DISK_TOTAL_NUM * sizeof(unsigned long)); + if (!is_in_range_k((unsigned long)data_ptr, __func__, __LINE__) + || !is_in_range_k((unsigned long)(data_ptr + TRACK_SYSCALL_NUM) - 1, __func__, __LINE__)) + goto out; + memset(data_ptr, 0, TRACK_SYSCALL_NUM * sizeof(struct func_latency)); + if (!is_in_range_k((unsigned long)func_data, __func__, __LINE__) + || !is_in_range_k((unsigned long)(func_data + HOOK_FUNC_NUM - 1), __func__, __LINE__)) + goto out; + memset(func_data, 0, HOOK_FUNC_NUM * sizeof(struct func_latency)); + if (stat_one_func_time) { + if (!is_in_range((unsigned long)stat_one_func_time, __func__, __LINE__) + || is_in_range((unsigned long)stat_one_func_time + stat_one_func_size - 1, __func__, __LINE__)) + goto out; + memset(stat_one_func_time, 0, stat_one_func_size); + } + access_vm_stat_time = 0; + access_vm_stat_num = 0; + } +out: + return ret; +} + +int data_init(void) +{ + int cpu, i, len, total_len; + + total_len = 3 * nr_cpu_ids * sizeof(unsigned long) + 3 * nr_cpu_ids * NR_syscalls * sizeof(unsigned long) + + 21 * nr_cpu_ids * sizeof(unsigned long) + 5 * nr_node_ids * sizeof(int) + + 3 * NR_syscalls * sizeof(unsigned long) + 2 * DISK_TOTAL_NUM * sizeof(unsigned long); + + /* 3 * nr_cpu_ids * sizeof(unsigned long) */ + stat_sys_num = (unsigned long **)vmalloc(total_len); + if (stat_sys_num == NULL) + return -ENOMEM; + + data_range_v.start = (unsigned long)stat_sys_num; + data_range_v.end = (unsigned long)stat_sys_num + total_len; + stat_sys_time = stat_sys_num + nr_cpu_ids; + stat_sys_time_block = stat_sys_num + 2 * nr_cpu_ids; + + /* 3 * nr_cpu_ids * NR_syscalls * sizeof(unsigned long) */ + stat_sys_num[0] = (unsigned long *)(stat_sys_num + 3 * nr_cpu_ids); + + stat_sys_time[0] = stat_sys_num[0] + nr_cpu_ids * NR_syscalls; + stat_sys_time_block[0] = stat_sys_num[0] + 2 * nr_cpu_ids * NR_syscalls; + for_each_possible_cpu(cpu) { + struct kernel_cpustat *kcs = &kcpustat_cpu(cpu); +#ifdef CONFIG_VM_EVENT_COUNTERS + struct vm_event_state *vm_stat = &per_cpu(vm_event_states, cpu); +#endif + irq_stat_data[0] += kcs->cpustat[CPUTIME_IRQ]; + irq_stat_data[1] += kcs->cpustat[CPUTIME_SOFTIRQ]; +#ifdef CONFIG_X86_64 + irq_stat_data[2] += kstat_cpu_irqs_sum(cpu) + stat_arch_irq_stat_cpu(cpu); +#endif + for (i = 0; i < NR_SOFTIRQS; i++) + irq_stat_data[3] += kstat_softirqs_cpu(i, cpu); + +#ifdef CONFIG_VM_EVENT_COUNTERS + pagefault_stat_data += vm_stat->event[PGMAJFAULT]; +#endif + + if (cpu == 0) + continue; + stat_sys_num[cpu] = stat_sys_num[cpu - 1] + NR_syscalls; + stat_sys_time[cpu] = stat_sys_time[cpu - 1] + NR_syscalls; + stat_sys_time_block[cpu] = stat_sys_time_block[cpu - 1] + NR_syscalls; + } +#ifdef CONFIG_X86_64 + irq_stat_data[2] += stat_arch_irq_stat(); +#endif + + len = 21 * nr_cpu_ids * 
sizeof(unsigned long); + /* len */ + stat_cpu_idle = (unsigned long *)(stat_sys_num[0] + 3 * nr_cpu_ids * NR_syscalls); + + message_info_stat = (struct func_latency *)stat_cpu_idle; + /* reuses the buffer for cycle message */ + message_total_index = len / sizeof(struct func_latency); + + stat_cpu_num = stat_cpu_idle + nr_cpu_ids; + stat_func_num = stat_cpu_idle + 2 * nr_cpu_ids; + stat_func_time = stat_cpu_idle + 3 * nr_cpu_ids; + stat_pagefault_num = stat_cpu_idle + 4 * nr_cpu_ids; + stat_pagefault_time = stat_cpu_idle + 5 * nr_cpu_ids; + stat_pagefault_time_block = stat_cpu_idle + 6 * nr_cpu_ids; + stat_allocpage_num = stat_cpu_idle + 7 * nr_cpu_ids; + stat_allocpage_time_block = stat_cpu_idle + 8 * nr_cpu_ids; + stat_bio_time_block = stat_cpu_idle + 9 * nr_cpu_ids; + stat_bio_num = stat_cpu_idle + 10 * nr_cpu_ids; + stat_bio_size = stat_cpu_idle + 11 * nr_cpu_ids; + stat_func_time_block = stat_cpu_idle + 12 * nr_cpu_ids; + stat_submit_bio_time = stat_cpu_idle + 13 * nr_cpu_ids; + stat_submit_bio_time_block = stat_cpu_idle + 14 * nr_cpu_ids; + stat_end_bio_time = stat_cpu_idle + 15 * nr_cpu_ids; + stat_end_bio_time_block = stat_cpu_idle + 16 * nr_cpu_ids; + stat_dispatch_time_block = stat_cpu_idle + 17 * nr_cpu_ids; + stat_slub_alloc_size = stat_cpu_idle + 18 * nr_cpu_ids; + stat_slub_alloc_num = stat_cpu_idle + 19 * nr_cpu_ids; + stat_slub_alloc_time_block = stat_cpu_idle + 20 * nr_cpu_ids; + + /* nr_node_ids * sizeof(int) */ + nr_running_per_node = (int *)(stat_cpu_idle + 21 * nr_cpu_ids); + + /* nr_cpu_ids * sizeof(int) */ + idle_cpu_stat = (int *)(nr_running_per_node + nr_node_ids); + + /* nr_cpu_ids * sizeof(int) */ + idle_cpu_stat_char = (int *)(idle_cpu_stat + nr_cpu_ids); + + /* 2 * nr_node_ids * sizeof(int) */ + stat_mem_node = (unsigned long *)(idle_cpu_stat_char + nr_cpu_ids); + stat_node_num = (unsigned long *)(stat_mem_node + nr_node_ids); + + /* 3 * NR_syscalls * sizeof(unsigned long) */ + stat_sys_num_tmp = (unsigned long *)(stat_node_num + nr_node_ids); + stat_sys_time_tmp = stat_sys_num_tmp + NR_syscalls; + stat_sys_time_tmp_block = stat_sys_time_tmp + NR_syscalls; + + /* 2 * DISK_TOTAL_NUM * sizeof(unsigned long) */ + stat_bio_disk_num = stat_sys_time_tmp_block + NR_syscalls; + stat_bio_disk_blocknum = stat_bio_disk_num + DISK_TOTAL_NUM; + + len = (TRACK_SYSCALL_NUM + HOOK_FUNC_NUM) * sizeof(struct func_latency); + data_ptr = (struct func_latency *)kzalloc(len, GFP_KERNEL); + if (data_ptr == NULL) + goto err_data_ptr; + data_range_k.start = (unsigned long)data_ptr; + data_range_k.end = (unsigned long)data_ptr + len; + + func_data = data_ptr + TRACK_SYSCALL_NUM; + + /* 3 * nr_cpu_ids * NR_syscalls * sizeof(unsigned long) */ + memset(stat_sys_num[0], 0, 3 * nr_cpu_ids * NR_syscalls * sizeof(unsigned long)); + memset(stat_cpu_idle, 0, 21 * nr_cpu_ids * sizeof(unsigned long)); + memset(stat_mem_node, 0, 2 * nr_node_ids * sizeof(int)); + memset(stat_bio_disk_num, 0, 2 * DISK_TOTAL_NUM * sizeof(unsigned long)); + + INIT_WORK(&ftrace_work, ftrace_unhook_work_fn); + INIT_WORK(&ftrace_work_init, ftrace_hook_work_fn); + return 0; +err_data_ptr: + vfree(stat_sys_num); + return -ENOMEM; +} + +int data_exit(void) +{ + if (stat_sys_num) + vfree(stat_sys_num); + if (data_ptr) + kfree(data_ptr); + if (stat_one_func_time) + vfree(stat_one_func_time); + + return 0; +} + diff --git a/ops/os_stat/os_stat/data_aware.h b/ops/os_stat/os_stat/data_aware.h new file mode 100644 index 0000000000000000000000000000000000000000..d7f3158beb4aef4d1f9defc0505916e301f462b4 --- /dev/null +++ 
b/ops/os_stat/os_stat/data_aware.h @@ -0,0 +1,202 @@ +#ifndef _NUMA_AWARE_H +#define _NUMA_AWARE_H + +#include "syms.h" +#include +#include +#ifndef TK2 +#include +#include +#endif +#include "version.h" +#include "kprobe_prehook.h" +#include "./include/kernel/sched/sched.h" +#ifdef CONFIG_X86 +#include "./include/include/generated/asm-offsets.h" +#include "./include/include//linux/nospec.h" +#endif + +#ifdef TK5 +#include "./include/kernel/module/internal.h" +#endif +#define PRINT_SYSCALL_NUM 12 +#define PRINT_MORE_NUM 13 +#define DISK_SDA 8 +#define DISK_DM 251 +#define DISK_VD 253 +#define DISK_NVME 259 +#define DISK_HDD 65 +#define DISK_HDD1 66 +#define DISK_HDD2 67 +#define DISK_HDD3 68 +#define DISK_SDA_NUM 40 +#define DISK_HDD_NUM 60 +#define DISK_HDD_SUBNUM 15 +#define DISK_DM_NUM 16 +#define DISK_VD_NUM 80 +#define DISK_NVME_NUM 80 +#define DISK_TOTAL_NUM (DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM + DISK_VD_NUM + DISK_NVME_NUM) +#define TRACK_SYSCALL_NUM (PRINT_SYSCALL_NUM + PRINT_MORE_NUM + DISK_TOTAL_NUM) + +#define STAT_SYSCALL_TYPE 0X01 +#define STAT_FUNC_TYPE 0X02 +#define STAT_PAGEFAULT_TYPE 0X04 + +struct func_latency { + unsigned long nr; + unsigned long num; + unsigned long latency; + unsigned long block_latency; + char func[NAME_MAX]; +}; + +struct address_range { + unsigned long start; + unsigned long end; +}; +#ifdef TK2 +typedef struct { + unsigned long total_numa_faults; + unsigned long numa_faults_locality[3]; + unsigned long numa_migrate_retry; + u64 node_stamp; /* migration stamp */ + unsigned long numa_pages_migrated; +} os_aware_t; +#define tk2_prev_ptr ((os_aware_t *)(prev->node_stamp)) +#define tk2_curr_ptr ((os_aware_t *)(current->node_stamp)) +#define tk2_next_ptr ((os_aware_t *)(next->node_stamp)) +#define tk2_p_ptr ((os_aware_t *)(p->node_stamp)) +#else +#define tk2_ptr prev +#define tk2_curr_ptr current +#define tk2_next_ptr next +#define tk2_p_ptr p +#endif + +extern unsigned int sysctl_module_enable; +extern unsigned int sysctl_module_debug; +extern unsigned int sysctl_trace_type; +extern unsigned int sysctl_module_disable; +extern unsigned int sysctl_module_monitor_sampling_rate; +extern unsigned int sysctl_module_print; +extern struct func_latency sysctl_data[TRACK_SYSCALL_NUM]; +extern unsigned int sysctl_module_offset_enable; +extern unsigned int sysctl_module_offset1; +extern unsigned int sysctl_module_offset2; +extern unsigned int sysctl_module_offset3; +extern unsigned int sysctl_module_which_parameter; +extern char sysctl_module_process_comm[NAME_MAX]; +extern unsigned int message_start; +extern unsigned int message_end; +extern unsigned int message_total_index; +extern unsigned int sysctl_module_block_enable; +extern unsigned long **stat_sys_num; +extern unsigned long **stat_sys_time; +extern unsigned long **stat_sys_time_block; +extern unsigned long *stat_func_num; +extern unsigned long *stat_func_time; +extern unsigned long *stat_func_time_block; +extern struct func_latency *message_info_stat; +extern struct percpu_counter ftrace_patch_num; +extern int hook_count; + +extern struct page *stat_alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask); +extern struct task_struct * stat_pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags_stat *rf); +extern blk_qc_t stat_submit_bio(struct bio *bio); +extern void stat_bio_endio(struct bio *bio); +extern void *stat__kmalloc(size_t size, gfp_t flags); +extern void *stat__kmalloc_node(size_t size, gfp_t flags, int node); +extern void 
*stat_kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags); +#ifdef TK5 +bool stat_blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, + unsigned int nr_budgets); +#else +extern bool stat_blk_mq_get_driver_tag(struct request *rq); +#endif +extern int stat_access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); +extern void stat_stat_syscall_enter(struct kret_data *data); +extern void stat_stat_syscall_exit(unsigned long nr, struct kret_data *data); +extern void stat_sched_rqm_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next); +extern struct rq *stat_finish_task_switch(struct task_struct *prev); + +#ifdef CONFIG_X86_64 +extern void stat_do_syscall_64(unsigned long nr, struct pt_regs *regs); +#ifdef TK5 +extern void stat_handle_level_irq(struct irq_desc *desc); +extern void stat_handle_fasteoi_irq(struct irq_desc *desc); +extern void stat_handle_edge_irq(struct irq_desc *desc); +extern void stat_handle_simple_irq(struct irq_desc *desc); +#else +extern unsigned int stat_do_IRQ(struct pt_regs *regs); +#endif +#endif +#ifdef TK2 +#ifdef CONFIG_X86_64 +extern unsigned long rdtsc(void); +#endif +extern void stat__sched_fork(struct task_struct *p); +extern void stat_do_exit(long code); +#endif +#ifdef CONFIG_ARM64 +extern unsigned long rdtsc(void); +extern void stat_gic_handle_irq(struct pt_regs *regs); +#endif + +extern void stat_do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address); +extern void stat_no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code); +extern void stat_do_general_protection(struct pt_regs *regs, long error_code); +extern void stat_do_divide_error(struct pt_regs *regs, long error_code); +extern void get_info(int len, +#ifdef TK5 + void *buffer); +#else + void __user *buffer); +#endif +extern void stat_syscall_return_slowpath(struct pt_regs *regs); +extern void stat_func_enter(struct kret_data *data); +extern void stat_func_exit(struct kret_data *data); +extern void set_sys_nr(unsigned long nr); +extern unsigned long get_sys_nr(void); +extern void set_func_type(int pos); +extern unsigned long get_func_type(void); +extern void clr_func_type(int pos); +extern void unload_disable_module(void); +extern void stat_do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); +extern bool module_is_enable(void); +#ifdef TK5 +extern vm_fault_t stat_handle_mm_fault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs); +extern void stat_psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); +#endif +extern void stat_rcu_note_context_switch(bool preempt); + +extern void set_sys_nr(unsigned long nr); +extern unsigned long get_sys_nr(void); +extern void set_func_type(int pos); +extern unsigned long get_func_type(void); +extern void clr_func_type(int pos); +extern void stat_time_start(void); +extern void stat_time_finish(void); +extern void context_check_start(void); +extern void context_check_end(void); +extern int alloc_buffer_for_stat(int hook_count); +extern void vfree_buffer(void); +extern bool context_check(void); +extern void context_exit(void); +extern inline bool is_in_range_v(unsigned long addr, char *func, int line); +extern inline bool is_in_range_k(unsigned long addr, char *func, int line); +extern inline bool is_in_range(unsigned long addr, char *func, int line); +#ifdef CONFIG_ARM64 +unsigned 
long rdtsc(void) +#endif +extern bool get_parameter(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6); + +extern int data_init(void); +extern int data_exit(void); + +#endif diff --git a/ops/os_stat/os_stat/ftrace_hook.c b/ops/os_stat/os_stat/ftrace_hook.c new file mode 100644 index 0000000000000000000000000000000000000000..464cfe3ef509a2d2a261416703ade41aa5cd7a07 --- /dev/null +++ b/ops/os_stat/os_stat/ftrace_hook.c @@ -0,0 +1,1085 @@ +/* + * Kernel dynamic hooks based on ftrace + * aurelianliu@tencent.com + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "version.h" +#include "hook.h" +#include "data_aware.h" +#include "kprobe_prehook.h" +#include "scene_layer.h" + +#ifdef CONFIG_FUNCTION_TRACER +#define CC_USING_FENTRY +#endif + +DEFINE_MUTEX(hook_func); +struct ftrace_verify_func +{ + char *modname; + struct module *mod; + char name[NAME_MAX]; + unsigned long old_addr; + unsigned long old_offset; + unsigned long new_addr; + unsigned long new_offset; +}; + +struct ftrace_ksym +{ + int type; + char name[NAME_MAX]; + void **address; +}; + +enum ksym_type { + KSYM_DEF, + KSYM_JMP_MCOUNT +}; + +struct percpu_counter ftrace_patch_num; +char ftrace_hook_name[NAME_MAX]; +struct work_struct ftrace_work; +struct work_struct ftrace_work_init; +unsigned long stat_func_total_time[HOOK_FUNC_NUM]; +unsigned long stat_func_total_num[HOOK_FUNC_NUM]; +unsigned long stat_hook_function = 1; +static bool stat_buffer_init; +unsigned long (*p__test_func)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +#define __test_func(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) \ + p__test_func(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) +char func_pointer[NAME_MAX]; +char func_pointer_name[NAME_MAX]; +char printk_name_first[NAME_MAX]; +char printk_name_last[NAME_MAX]; +char show_parameter_val[NAME_MAX]; +char show_parameter_type[NAME_MAX]; +int hook_count; +static int hook_var_count; +unsigned long test_func(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8) +{ + ssize_t ret; + bool run, done; + + if (context_check()) + goto next; + print_func_name((void *)arg1, (void *)arg2, (void *)arg3, func_pointer, func_pointer_name); + print_info((void *)arg1, (void *)arg2, (void *)arg3); + run = get_parameter(arg1, arg2, arg3, arg4, arg5, arg6); + catch_kill_signal(arg1, arg2, arg3); + + /* for each scene adjust, such as fs/net io */ + scene_before(arg1, arg2, arg3, arg4, arg5, arg6); + + stat_func_enter(NULL); + context_check_end(); +next: + ret = do_scene(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, &done, 0); + if (!done) + ret = __test_func(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); + context_check_start(); + stat_func_exit(NULL); + + if (!run) + get_parameter(arg1, arg2, arg3, arg4, arg5, arg6); + /* whole sample, not set 0 */ + if (sysctl_module_monitor_sampling_rate != 1) + stat_hook_function = 0; + + /* stat latency between test_func1 and test_func */ + if (sysctl_trace_type == 1) + save_total_time(1, false); + + /* for each scene adjust, such as fs/net io */ + scene_after(arg1, arg2, arg3, arg4, arg5, ret); + + context_exit(); + + return ret; +} +unsigned long print_before_1(unsigned long arg1, unsigned long 
arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret; + + ret = scene_before_1(arg1, arg2, arg3, arg4, arg5, arg6); + save_start_time(1); + return 0; + +} +void print_after_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, ssize_t ret, unsigned long test) +{ + scene_after_1(arg1, arg2, arg3, arg4, arg5, ret, test); + save_total_time(1, true); +} +unsigned long print_before_2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret; + + ret = scene_before_2(arg1, arg2, arg3, arg4, arg5, arg6); + save_start_time(2); + return 0; +}; +void print_after_2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, ssize_t ret, unsigned long test) +{ + scene_after_2(arg1, arg2, arg3, arg4, arg5, ret, test); + save_total_time(2, true); +}; +unsigned long print_before_3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret; + + ret = scene_before_3(arg1, arg2, arg3, arg4, arg5, arg6); + save_start_time(3); + return 0; +}; +void print_after_3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, ssize_t ret, unsigned long test) +{ + scene_after_3(arg1, arg2, arg3, arg4, arg5, ret, test); + save_total_time(3, true); +}; + +unsigned long print_before_4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret; + + ret = scene_before_4(arg1, arg2, arg3, arg4, arg5, arg6); + + save_start_time(4); + return 0; +}; +void print_after_4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, ssize_t ret, unsigned long test) +{ + scene_after_4(arg1, arg2, arg3, arg4, arg5, ret, test); + save_total_time(4, true); +}; +unsigned long print_before_5(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret; + + ret = scene_before_4(arg1, arg2, arg3, arg4, arg5, arg6); + + save_start_time(5); + return ret; +}; +void print_after_5(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, ssize_t ret, unsigned long test) +{ + scene_after_5(arg1, arg2, arg3, arg4, arg5, ret, test); + save_total_time(5, true); +}; + +#define DEFINE_TEST_PRINT(name) \ +unsigned long print_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6) \ +{ \ +\ + unsigned long ret; \ +\ + ret = scene_before_##name(arg1, arg2, arg3, arg4, arg5, arg6); \ + save_start_time(name); \ + return ret; \ +};\ +\ +void print_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, ssize_t ret, unsigned long test) \ +{ \ + scene_after_##name(arg1, arg2, arg3, arg4, arg5, ret, test); \ + save_total_time(name, true); \ +}; + +#define DEFINE_TEST(name) \ +unsigned long *p_test_var_##name; \ +unsigned long (*p__test_func##name)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); \ +unsigned long test_func##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, 
unsigned long arg4, \ + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8) \ +{\ + ssize_t ret; \ + unsigned long test; \ +\ + if (context_check()) \ + goto next; \ +\ + test = print_before_##name(arg1, arg2, arg3, arg4, arg5, arg6); \ + context_check_end(); \ +next:\ + ret = do_scene(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, &done, name); \ + if (!done) \ + ret = p__test_func##name(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ + context_check_start(); \ +\ + print_after_##name(arg1, arg2, arg3, arg4, arg5, ret, test); \ +\ + context_exit(); \ + \ + return ret; \ +} + +//HOOK_FUNC_NUM 50 +DEFINE_TEST_PRINT(6) +DEFINE_TEST_PRINT(7) +DEFINE_TEST_PRINT(8) +DEFINE_TEST_PRINT(9) +DEFINE_TEST_PRINT(10) +DEFINE_TEST_PRINT(11) +DEFINE_TEST_PRINT(12) +DEFINE_TEST_PRINT(13) +DEFINE_TEST_PRINT(14) +DEFINE_TEST_PRINT(15) +DEFINE_TEST_PRINT(16) +DEFINE_TEST_PRINT(17) +DEFINE_TEST_PRINT(18) +DEFINE_TEST_PRINT(19) +DEFINE_TEST_PRINT(20) +DEFINE_TEST_PRINT(21) +DEFINE_TEST_PRINT(22) +DEFINE_TEST_PRINT(23) +DEFINE_TEST_PRINT(24) +DEFINE_TEST_PRINT(25) +DEFINE_TEST_PRINT(26) +DEFINE_TEST_PRINT(27) +DEFINE_TEST_PRINT(28) +DEFINE_TEST_PRINT(29) +DEFINE_TEST_PRINT(30) +DEFINE_TEST_PRINT(31) +DEFINE_TEST_PRINT(32) +DEFINE_TEST_PRINT(33) +DEFINE_TEST_PRINT(34) +DEFINE_TEST_PRINT(35) +DEFINE_TEST_PRINT(36) +DEFINE_TEST_PRINT(37) +DEFINE_TEST_PRINT(38) +DEFINE_TEST_PRINT(39) +DEFINE_TEST_PRINT(40) +DEFINE_TEST_PRINT(41) +DEFINE_TEST_PRINT(42) +DEFINE_TEST_PRINT(43) +DEFINE_TEST_PRINT(44) +DEFINE_TEST_PRINT(45) +DEFINE_TEST_PRINT(46) +DEFINE_TEST_PRINT(47) +DEFINE_TEST_PRINT(48) +DEFINE_TEST_PRINT(49) +DEFINE_TEST_PRINT(50) + + +DEFINE_TEST(1) +DEFINE_TEST(2) +DEFINE_TEST(3) +DEFINE_TEST(4) +DEFINE_TEST(5) +DEFINE_TEST(6) +DEFINE_TEST(7) +DEFINE_TEST(8) +DEFINE_TEST(9) +DEFINE_TEST(10) +DEFINE_TEST(11) +DEFINE_TEST(12) +DEFINE_TEST(13) +DEFINE_TEST(14) +DEFINE_TEST(15) +DEFINE_TEST(16) +DEFINE_TEST(17) +DEFINE_TEST(18) +DEFINE_TEST(19) +DEFINE_TEST(20) +DEFINE_TEST(21) +DEFINE_TEST(22) +DEFINE_TEST(23) +DEFINE_TEST(24) +DEFINE_TEST(25) +DEFINE_TEST(26) +DEFINE_TEST(27) +DEFINE_TEST(28) +DEFINE_TEST(29) +DEFINE_TEST(30) +DEFINE_TEST(31) +DEFINE_TEST(32) +DEFINE_TEST(33) +DEFINE_TEST(34) +DEFINE_TEST(35) +DEFINE_TEST(36) +DEFINE_TEST(37) +DEFINE_TEST(38) +DEFINE_TEST(39) +DEFINE_TEST(40) +DEFINE_TEST(41) +DEFINE_TEST(42) +DEFINE_TEST(43) +DEFINE_TEST(44) +DEFINE_TEST(45) +DEFINE_TEST(46) +DEFINE_TEST(47) +DEFINE_TEST(48) +DEFINE_TEST(49) +DEFINE_TEST(50) + +static int hook_index[HOOK_FUNC_NUM]; +/* + * ksyms init + */ +static struct ftrace_ksym syms[] = { + {KSYM_JMP_MCOUNT, "vfs_read", (void **)&p__test_func}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func1}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func2}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func3}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func4}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func5}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func6}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func7}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func8}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func9}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func10}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func11}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func12}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func13}, + {KSYM_JMP_MCOUNT, 
"p__test_func_tmp", (void **)&p__test_func14}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func15}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func16}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func17}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func18}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func19}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func20}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func21}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func22}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func23}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func24}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func25}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func26}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func27}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func28}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func29}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func30}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func31}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func32}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func33}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func34}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func35}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func36}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func37}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func38}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func39}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func40}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func41}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func42}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func43}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func44}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func45}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func46}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func47}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func48}, + {KSYM_JMP_MCOUNT, "p__test_func_tmp", (void **)&p__test_func49}, +}; +static struct ftrace_ksym syms_var[] = { + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_1}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_2}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_3}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_4}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_5}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_6}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_7}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_8}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_9}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_10}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_11}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_12}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_13}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_14}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_15}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_16}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_17}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_18}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_19}, + {KSYM_DEF, "p_test_var_tmp", (void 
**)&p_test_var_20}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_21}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_22}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_23}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_24}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_25}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_26}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_27}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_28}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_29}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_30}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_31}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_32}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_33}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_34}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_35}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_36}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_37}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_38}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_39}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_40}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_41}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_42}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_43}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_44}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_45}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_46}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_47}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_48}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_49}, + {KSYM_DEF, "p_test_var_tmp", (void **)&p_test_var_50}, +}; + +int ftrace_ksyms_init(struct ftrace_ksym syms[50]) +{ + unsigned long addr; + int i = 0; + int count = HOOK_FUNC_NUM; + + /* Init kernel symbols */ + while (true) { + if (i >= count || strstr(syms[i].name, "p__test_func_tmp") || strstr(syms[i].name, "p_test_var_tmp")) + break; + +#ifdef TK5 + addr = kallsyms_lookup_name_tk5(syms[i].name); +#else + addr = kallsyms_lookup_name(syms[i].name); +#endif + if (!addr) { + pr_err("symbol %s not found, %d\n", syms[i].name, i); + + if (hook_count > 0) { + i++; + continue; + } + + return -ENODEV; + } + pr_info("ksyms:symbol %s found, %d, %lx\n", syms[i].name, i, addr); + + switch (syms[i].type) { + case KSYM_DEF: + syms[i++].address[0] = (void *)addr; + break; + case KSYM_JMP_MCOUNT: + syms[i++].address[0] = (void *)(addr + MCOUNT_INSN_SIZE); + break; + default: + pr_err("symbol %s invalid type %d\n", syms[i].name, syms[i].type); + return -EINVAL; + } + } + return 0; +} + +/* + * Patched functions + */ +#ifdef CC_USING_FENTRY +static unsigned long verify_funcs_stat[51]; +static struct ftrace_verify_func verify_funcs[] = +{ + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func1, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func2, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func3, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func4, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func5, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func6, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func7, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func8, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func9, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned 
long)test_func10, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func11, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func12, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func13, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func14, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func15, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func16, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func17, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func18, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func19, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func20, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func21, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func22, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func23, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func24, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func25, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func26, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func27, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func28, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func29, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func30, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func31, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func32, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func33, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func34, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func35, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func36, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func37, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func38, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func39, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func40, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func41, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func42, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func43, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func44, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func45, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func46, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func47, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func48, 0}, + {NULL, NULL, "test_func_tmp", 0, 0, (unsigned long)test_func49, 0}, +}; + //{NULL, NULL, "do_syscall_64", 0, 0, (unsigned long)stat_do_syscall_64, 0}, + +#else + +#error "error, compiler donot support fentry?"; + +#endif + +unsigned long (*scene_func[HOOK_FUNC_NUM + 1])(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6, + unsigned long arg7, unsigned long arg8) = { +test_func, test_func1, test_func2, test_func3, test_func4, test_func5, test_func6, test_func7, +test_func8, test_func9, test_func10, test_func11, test_func12, test_func13, test_func14, test_func15, +test_func16, test_func17, test_func18, test_func19, test_func20, test_func21, test_func22, test_func23, +test_func24, test_func25, test_func26, test_func27, test_func28, test_func29, test_func30, test_func31, +test_func32, 
test_func33, test_func34, test_func35, test_func36, test_func37, test_func38, test_func39, +test_func40, test_func41, test_func42, test_func43, test_func44, test_func45, test_func46, test_func47, +test_func48, test_func49, test_func50}; + +unsigned long (**scene_func_body[HOOK_FUNC_NUM + 1])(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6, + unsigned long arg7, unsigned long arg8) = { +&p__test_func, &p__test_func1, &p__test_func2, &p__test_func3, &p__test_func4, &p__test_func5, &p__test_func6, &p__test_func7, +&p__test_func8, &p__test_func9, &p__test_func10, &p__test_func11, &p__test_func12, &p__test_func13, &p__test_func14, &p__test_func15, +&p__test_func16, &p__test_func17, &p__test_func18, &p__test_func19, &p__test_func20, &p__test_func21, &p__test_func22, &p__test_func23, +&p__test_func24, &p__test_func25, &p__test_func26, &p__test_func27, &p__test_func28, &p__test_func29, &p__test_func30, &p__test_func31, +&p__test_func32, &p__test_func33, &p__test_func34, &p__test_func35, &p__test_func36, &p__test_func37, &p__test_func38, &p__test_func39, +&p__test_func40, &p__test_func41, &p__test_func42, &p__test_func43, &p__test_func44, &p__test_func45, &p__test_func46, &p__test_func47, +&p__test_func48, &p__test_func49, &p__test_func50}; + +unsigned long **scene_var[HOOK_FUNC_NUM] = { +&p_test_var_1, &p_test_var_2, &p_test_var_3, &p_test_var_4, &p_test_var_5, &p_test_var_6, &p_test_var_7, &p_test_var_8, +&p_test_var_9, &p_test_var_10, &p_test_var_11, &p_test_var_12, &p_test_var_13, &p_test_var_14, &p_test_var_15, &p_test_var_16, +&p_test_var_17, &p_test_var_18, &p_test_var_19, &p_test_var_20, &p_test_var_21, &p_test_var_22, &p_test_var_23, &p_test_var_24, +&p_test_var_25, &p_test_var_26, &p_test_var_27, &p_test_var_28, &p_test_var_29, &p_test_var_30, &p_test_var_31, &p_test_var_32, +&p_test_var_33, &p_test_var_34, &p_test_var_35, &p_test_var_36, &p_test_var_37, &p_test_var_38, &p_test_var_39, &p_test_var_40, +&p_test_var_41, &p_test_var_42, &p_test_var_43, &p_test_var_44, &p_test_var_45, &p_test_var_46, &p_test_var_47, &p_test_var_48, +&p_test_var_49, &p_test_var_50}; + +bool check_func_name(char *func) +{ + return strstr(verify_funcs[0].name, func); +} + +int get_one_func_count(void) +{ + return hook_count + 1; +} +char *get_one_func_name(int i) +{ + if (i > hook_count) + return "NULL"; + return verify_funcs[i].name; +} + +func_type get_one_func(char *name) +{ + int i; + + for (i = 0; i <= hook_count; i++) + if (strstr(verify_funcs[i].name, name)) + break; + if (i <= hook_count) + return scene_func[i]; + + return NULL; +} + +func_body_type get_one_func_body(char *name) +{ + int i; + + for (i = 0; i <= hook_count; i++) + if (strstr(verify_funcs[i].name, name)) + break; + if (i <= hook_count) + return scene_func_body[i]; + + return NULL; +} + +unsigned long **get_one_var(char *name) +{ + int i; + + for (i = 0; i <= hook_count; i++) + if (strstr(syms_var[i].name, name)) + break; + if (i <= hook_count) + return scene_var[i]; + + return NULL; +} + +static struct module *get_link_module(char *modname) +{ + struct module *mod; + +#ifdef TK5 + mutex_lock(module_mutex_tk5); + mod = find_module_tk5(modname); +#else + mutex_lock(&module_mutex); + mod = find_module(modname); +#endif + if (!mod) { +#ifdef TK5 + mutex_unlock(module_mutex_tk5); +#else + mutex_unlock(&module_mutex); +#endif + return NULL; + } + + WARN_ON(!try_module_get(mod)); +#ifdef TK5 + mutex_unlock(module_mutex_tk5); +#else + mutex_unlock(&module_mutex); +#endif + + 
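+	/*
+	 * find_module() only looks the module up; the try_module_get() above
+	 * pins it so it cannot be unloaded while its functions are being
+	 * patched. The reference is dropped again through put_link_modules()
+	 * in patch_kernel() once the entry has been processed or skipped.
+	 */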
return mod; +} + +static inline void put_link_modules(struct module *mod) +{ + if (mod) + module_put(mod); +} + +static int verify_kernel(void) +{ + int i, count = ARRAY_SIZE(verify_funcs); + char name[256]; + int status = -ENXIO; + + for (i = 0; i < count; i++) { + if (verify_funcs[i].modname) { + verify_funcs[i].mod = get_link_module(verify_funcs[i].modname); + if (!verify_funcs[i].mod){ + pr_err("unable to find module '%s'\n", verify_funcs[i].modname); + return -ENXIO; + } + snprintf(name, sizeof(name), "%s:%s",verify_funcs[i].modname, verify_funcs[i].name); + } else { + if(strstr(verify_funcs[i].name, "test_func_tmp")) + continue; + strcpy(name, verify_funcs[i].name); + } + + /* check symbol */ +#ifdef TK5 + verify_funcs[i].old_addr = kallsyms_lookup_name_tk5(name); +#else + verify_funcs[i].old_addr = kallsyms_lookup_name(name); +#endif + if (!verify_funcs[i].old_addr) { + pr_err("unable to find symbol '%s'\n", name); + if (hook_count > 0) { + strcpy(verify_funcs[i].name, "test_func_tmp"); + continue; + } + return -ENXIO; + } + status = 0; + pr_info("find symbol '%s', %lx\n", name, verify_funcs[i].old_addr); + } + return status; +} + +static struct ftrace_verify_func *tpatch_find_match_ip(unsigned long ip) +{ + int count = ARRAY_SIZE(verify_funcs); + int i; + + if (register_ftrace_ftrace != FTRACE_REGISTER) + return NULL; + + for ( i =0; i < count ; i++) { + if (ip == verify_funcs[i].old_addr + verify_funcs[i].old_offset) { + if (verify_funcs_stat[i]++ % sysctl_module_monitor_sampling_rate != 0) + return NULL; + stat_hook_function = 1; + return &verify_funcs[i]; + } + } + + return NULL; +} + +/* Update regs->ip to tell ftrace to return + * to the new function.*/ +#ifdef TK5 +static void notrace tpatch_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *fops, struct ftrace_regs *regs) +#else +static void notrace tpatch_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *fops, struct pt_regs *regs) +#endif +{ + struct ftrace_verify_func *func; + + preempt_disable_notrace(); + +#ifdef CONFIG_ARM64 + /* + * In arm64, the compiler has inserted two NOPs before the regular + * function prologue. + * + * At runtime we want to be able to swing a single NOP <-> BL to enable or + * disable the ftrace call. The BL requires us to save the original LR value, + * so here ftrace insert a over the first NOP so the instructions + * before the regular prologue are: + * + * | Compiled | Disabled | Enabled | + * +----------+------------+------------+ + * | NOP | MOV X9, LR | MOV X9, LR | + * | NOP | NOP | BL | <== this is ip/pc actually position. + * + * When ftrace call into this fuction, ip/pc is actually at the second + * instruction. Therefore, we need to decrease ip/pc with 1 * AARCH64_INSN_SIZE + * to get correct function address. 
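+ *
+ * Illustrative example (addresses made up): if foo() starts at
+ * 0xffff800010001000, the handler is entered with ip/pc at
+ * 0xffff800010001004 (the BL slot). Subtracting AARCH64_INSN_SIZE (4)
+ * recovers the address kallsyms_lookup_name() reported for foo(), which
+ * is what tpatch_find_match_ip() compares against.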
+ */ + ip -= AARCH64_INSN_SIZE; +#endif + + func = tpatch_find_match_ip(ip); + if (func) +#ifdef CONFIG_X86 +#ifdef TK5 + regs->regs.ip = func->new_addr + MCOUNT_INSN_SIZE; +#else + regs->ip = func->new_addr + MCOUNT_INSN_SIZE; +#endif +#elif defined(CONFIG_ARM64) +#ifdef TK5 + regs->regs.pc = func->new_addr; +#else + regs->pc = func->new_addr; +#endif +#endif + + preempt_enable_notrace(); +} + +static struct ftrace_ops tpatch_ftrace_ops __read_mostly = { + .func = tpatch_ftrace_handler, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; + +static int patch_kernel(void) +{ + int count = ARRAY_SIZE(verify_funcs); + int ret, i = 0; + + for ( i =0; i < count ; i++) { + if(strstr(verify_funcs[i].name, "test_func_tmp")) + goto put_module; + ret = ftrace_set_filter(&tpatch_ftrace_ops, verify_funcs[i].name, strlen(verify_funcs[i].name), 0); + if (ret < 0) { + pr_err("can't set ftrace filter func:%s at address 0x%lx, ret(%d)\n", verify_funcs[i].name, + verify_funcs[i].old_addr, ret); + if (hook_count > 0) { + hook_index[i] = i + 1; + continue; + } + goto error; + } + hook_index[i] = 0; + +put_module: + /* put module */ + put_link_modules(verify_funcs[i].mod); + } + + ret = register_ftrace_function(&tpatch_ftrace_ops); + if (ret < 0) { + pr_err("can't register ftrace handler\n"); + goto error; + } + + return 0; + + error: + for (; i < count; i++) + put_link_modules(verify_funcs[i].mod); + + return ret; +} + +int ftrace_patch_init(char *name) +{ + int ret, ret_new, ret_success, i; + + if (!name) { + pr_err("name null\n"); + return -ENODEV; + } + + mutex_lock(&hook_func); + if (register_ftrace_ftrace == FTRACE_UNREGISTERING) { + while (register_ftrace_ftrace == FTRACE_UNREGISTERING) + schedule_timeout(1000); + } + + if (register_ftrace_ftrace == FTRACE_REGISTER) { + mutex_unlock(&hook_func); + pr_err("need unregister ftrace hook function:maybe:%s; or: %s", ftrace_hook_name, symbol_new); + return -EPERM; + } + register_ftrace_ftrace = FTRACE_INIT; + tpatch_ftrace_ops.func = tpatch_ftrace_handler; + tpatch_ftrace_ops.flags = FTRACE_OPS_FL_SAVE_REGS; + + ret = alloc_buffer_for_stat(hook_count); + if (ret < 0) { + mutex_unlock(&hook_func); + return ret; + } + + init_scene(); + pr_info("%s, %s, %s, %d", __func__, verify_funcs[hook_count].name, name, hook_count); + + strcpy(verify_funcs[0].name, name); + strcpy(syms[0].name, name); + + ftrace_ksyms_init(syms); + ftrace_ksyms_init(syms_var); + /* Verify patched functions */ + ret = verify_kernel(); + if(ret < 0) { + mutex_unlock(&hook_func); + vfree_buffer(); + pr_err("Incorrect kernel, or function not found\n"); + return -ENODEV; + } + +#if defined(TK2) && !defined(KVM3) + percpu_counter_init(&ftrace_patch_num, 0); +#else + percpu_counter_init(&ftrace_patch_num, 0, GFP_KERNEL); +#endif + /* Ok, try to replace target functions */ + ret = patch_kernel(); + if (ret < 0) { + percpu_counter_destroy(&ftrace_patch_num); + mutex_unlock(&hook_func); + vfree_buffer(); + return ret; + } + ret_success = -1; + for (i = 0; i < HOOK_FUNC_NUM; i++) { + if (hook_index[i] == 0) { + ret_success = ret; + continue; + } + ret_new = register_kret_new_func_batch(i + 1, verify_funcs[i].name); + if (ret_new < 0) + hook_index[i] = 0; + else + ret_success = ret_new; + } + + if (ret_success > 0) + ret = ret_success; + register_ftrace_ftrace = FTRACE_REGISTER; + mutex_unlock(&hook_func); + return ret; +} + +void ftrace_patch_exit(void) +{ + int count = ARRAY_SIZE(verify_funcs) - 1, i; + int count_syms = ARRAY_SIZE(syms) - 1; + + if (register_ftrace_ftrace != FTRACE_UNREGISTERING) + goto out; + + 
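+	/*
+	 * Teardown order: mark the stat buffer unusable, unregister the ftrace
+	 * ops so no new calls are redirected, wait for every in-flight patched
+	 * call to drain via ftrace_patch_num, then drop the per-slot batches
+	 * registered through register_kret_new_func_batch() and reset the name
+	 * tables to their *_tmp placeholders so the next hook request starts
+	 * from a clean slate.
+	 */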
stat_buffer_init = false; + /* Destroy ftrace filter */ + unregister_ftrace_function(&tpatch_ftrace_ops); + memset(&tpatch_ftrace_ops, 0, sizeof(tpatch_ftrace_ops)); + synchronize_rcu(); + + /* Wait all exit patched function */ + while (percpu_counter_sum(&ftrace_patch_num)) + msleep(1); + + percpu_counter_destroy(&ftrace_patch_num); + + for (i = 0; i < HOOK_FUNC_NUM; i++) { + if (hook_index[i] == 0) + continue; + unregister_kret_new_func_batch(i); + } + + for (i = 1; i <= count; i++) + strcpy(verify_funcs[i].name, "test_func_tmp"); + for (i = 1; i <= count_syms; i++) + strcpy(syms[i].name, "p__test_func_tmp"); + for (i = 0; i < HOOK_FUNC_NUM; i++) { + if (hook_index[i] != 0) + hook_index[i] = 0; + } + + hook_count = 0; + register_ftrace_ftrace = FTRACE_INIT; + exit_scene(); + vfree_buffer(); +out: + return; +} + +int sysctl_ftrace_hook_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + ret = proc_dostring(table, write, buffer, lenp, ppos); + + if (!write) + return ret; + + if (register_ftrace_ftrace == FTRACE_REGISTER_FAILED) + register_ftrace_ftrace = FTRACE_INIT; + schedule_work_on(0, &ftrace_work_init); + + return 0; +} + +int sysctl_ftrace_func_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + int count = ARRAY_SIZE(verify_funcs) - 1; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + if (hook_count >= count) + return -EPERM; + + ret = proc_dostring(table, write, buffer, lenp, ppos); + + if (!write) + return ret; + hook_count++; + strcpy(verify_funcs[hook_count].name, ftrace_hook_name); + strcpy(syms[hook_count].name, ftrace_hook_name); + + return 0; +} + +int sysctl_ftrace_var_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + int count = ARRAY_SIZE(syms_var) - 1; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + if (hook_count >= count) + return -EPERM; + + ret = proc_dostring(table, write, buffer, lenp, ppos); + + if (!write) + return ret; + strcpy(syms_var[hook_var_count].name, ftrace_hook_name); + hook_var_count++; + + return 0; +} + +void ftrace_hook_work_fn(struct work_struct *work) +{ + int ret; + + ret = ftrace_patch_init(ftrace_hook_name); + if (ret < 0 && register_ftrace_ftrace != FTRACE_REGISTER) + register_ftrace_ftrace = FTRACE_REGISTER_FAILED; + return; +} + +void ftrace_unhook_work_fn(struct work_struct *work) +{ + int i; + + mutex_lock(&hook_func); + + if (register_ftrace_ftrace != FTRACE_REGISTER) + goto out; + + hook_count = 0; + + sysctl_module_offset_enable = 0; + register_ftrace_ftrace = FTRACE_UNREGISTERING; + ftrace_patch_exit(); + +out: + mutex_unlock(&hook_func); + + return; +} +int sysctl_ftrace_unhook_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + ret = proc_dostring(table, write, buffer, lenp, ppos); + + if (!write) + return ret; + + schedule_work_on(0, &ftrace_work); + + return ret; +} diff --git a/ops/os_stat/os_stat/func_pointer_table.c b/ops/os_stat/os_stat/func_pointer_table.c new file mode 100644 index 
0000000000000000000000000000000000000000..c4639e070e2f3004eb57669be3e9ddab68c56809 --- /dev/null +++ b/ops/os_stat/os_stat/func_pointer_table.c @@ -0,0 +1,604 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#if defined(TK4) || defined(TK4_NEW) +#include +#else +#include +#endif +#include +#include "kprobe_prehook.h" +#include "syms.h" +#include "./include/kernel/sched/sched.h" +#ifndef TK2 +#include "./include/drivers/target/target_core_file.h" +#include "./include/drivers/block/loop.h" +#endif + +#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) +extern unsigned int sysctl_module_debug; +struct shm_file_data { + int id; + struct ipc_namespace *ns; + struct file *file; + const struct vm_operations_struct *vm_ops; +}; +bool print_func_name(void *opt, void *opt2, void *opt3, char *pointer, char *func) +{ + bool ret = true; +#ifndef TK2 + if (sysctl_module_debug != FTRACE_CONTROL_POINTER) + return false; + + /***************** fs *****************/ + if ((check_func_name("vfs_read") || check_func_name("new_sync_read")) && strstr(pointer, "file->f_op->read_iter")) { + struct file *file = (struct file *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->read_iter); + return ret; + } + if ((check_func_name("vfs_write") || check_func_name("new_sync_write")) && strstr(pointer, "file->f_op->write_iter")) { + struct file *file = (struct file *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->write_iter); + return ret; + } + if (check_func_name("vfs_read") && strstr(pointer, "file->f_op->read")) { + struct file *file = (struct file *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->read); + return ret; + } + if (check_func_name("vfs_write") && strstr(pointer, "file->f_op->write")) { + struct file *file = (struct file *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->write); + return ret; + } + if (check_func_name("vfs_ioctl") && strstr(pointer, "filp->f_op->unlocked_ioctl")) { + struct file *file = (struct file *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->unlocked_ioctl); + return ret; + } + if (check_func_name("compat_blkdev_ioctl") && strstr(pointer, "disk->fops->compat_ioctl")) { + struct file *file = (struct file *)opt; + struct inode *inode = file->f_mapping->host; +#if defined(TK4) || defined(TK4_NEW) + struct block_device *bdev = inode->i_bdev; +#else + struct block_device *bdev = I_BDEV(file->f_mapping->host); +#endif + struct gendisk *disk = bdev->bd_disk; + sprintf(func, FUNCTION_PRINT_FORMAT, disk->fops->compat_ioctl); + return ret; + } + if (check_func_name("call_mmap") && strstr(pointer, "file->f_op->mmap")) { + struct file *file = (struct file *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->mmap); + return ret; + } + if (check_func_name("vfs_fsync_range") && strstr(pointer, "file->f_op->fsync")) { + struct file *file = (struct file *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->fsync); + return ret; + } + if ((check_func_name("do_splice_to")) && strstr(pointer, "splice_read")) { + struct file *in = (struct file *)opt; + if (in->f_op->splice_read) + sprintf(func, FUNCTION_PRINT_FORMAT, in->f_op->splice_read); + else + sprintf(func, "%s", "default_file_splice_read"); + return ret; + } + if ((check_func_name("do_splice_from")) && strstr(pointer, "splice_write")) { + struct file *out = (struct file *)opt; + if (out->f_op->splice_write) + sprintf(func, FUNCTION_PRINT_FORMAT, out->f_op->splice_write); + else + sprintf(func, "%s", 
"default_file_splice_write"); + return ret; + } + if (check_func_name("get_unmapped_area") && strstr(pointer, "file->f_op->fsync")) { + struct file *file = (struct file *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->fsync); + return ret; + } + if (strstr(pointer, "file->f_op->fallocate")) { + struct loop_device *lo = NULL; + struct se_cmd *cmd = NULL; + struct fd_dev *fd_dev = NULL; + struct file *file = NULL; + if (check_func_name("lo_fallocate")) + lo = (struct loop_device *)opt; + file = lo->lo_backing_file; + if (check_func_name("fd_execute_unmap")) { + cmd = (struct se_cmd *)opt; + fd_dev = container_of(cmd->se_dev, struct fd_dev, dev); + file = fd_dev->fd_file; + } + if (check_func_name("vfs_fallocate")) + file = (struct file *)opt; + if (file) + sprintf(func, FUNCTION_PRINT_FORMAT, file->f_op->fallocate); + return ret; + } + if (check_func_name("sfd->file->f_op->fallocate") && (strstr(pointer, "std->file->i_op->fallocate"))) { + struct file *file = NULL; + struct shm_file_data *sfd; + file = (struct file *)opt; + sfd = shm_file_data(file); + if (sfd->file->f_op->fallocate) + sprintf(func, FUNCTION_PRINT_FORMAT, sfd->file->f_op->fallocate); + return ret; + } + if (check_func_name("ioctl_fiemap") && (strstr(pointer, "inode->i_op->fiemap"))) { + struct inode *inode; + struct file *filp = (struct file *)opt; + inode = file_inode(filp); + if (inode->i_op->fiemap) + sprintf(func, FUNCTION_PRINT_FORMAT, inode->i_op->fiemap); + return ret; + } +#if 0 + if (check_func_name("ovl_fiemap") && (strstr(pointer, "realinode->i_op->fiemap"))) { + struct inode *inode = (struct inode *)opt; + struct inode *realinode = ovl_inode_real(inode); + if (realinode->i_op->fiemap) + sprintf(func, FUNCTION_PRINT_FORMAT, realinode->i_op->fiemap); + return; + } +#endif + sub_print_func_name(opt, opt2, opt3, pointer, func); + if (strstr(pointer, "mapping->a_ops->writepages")) { + struct address_space *mapping = (struct address_space *)opt; + if (mapping->a_ops->writepages) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->writepages); + return ret; + } + if ((check_func_name("pagecache_write_begin")) && strstr(pointer, "a_ops->write_begin")) { + struct address_space *mapping = (struct address_space *)opt2; + if (mapping && mapping->a_ops->write_begin) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->write_begin); + return ret; + } + if ((check_func_name("generic_perform_write")) && strstr(pointer, "a_ops->write_begin")) { + struct file *file = (struct file *)opt; + struct address_space *mapping = file->f_mapping; + if (mapping && mapping->a_ops->write_begin) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->write_begin); + return ret; + } + if ((check_func_name("pagecache_write_end")) && strstr(pointer, "a_ops->write_end")) { + struct address_space *mapping = (struct address_space *)opt2; + if (mapping && mapping->a_ops->write_end) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->write_end); + return ret; + } + if ((check_func_name("generic_perform_write")) && strstr(pointer, "a_ops->write_end")) { + struct file *file = (struct file *)opt; + struct address_space *mapping = file->f_mapping; + if (mapping && mapping->a_ops->write_end) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->write_end); + return ret; + } +#if 0 + if (strstr(pointer, "mapping->a_ops->set_page_dirty")) { + struct page *page = (struct page *)opt; + struct swap_info_struct *sis = page_swap_info(page); + struct address_space *mapping = NULL; + + if (sis && sis->swap_file) + mapping = 
sis->swap_file->f_mapping; + if (mapping && mapping->a_ops->writepages) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->set_page_dirty); + return; + } +#endif + if ((check_func_name("bmap")) && strstr(pointer, "inode->i_mapping->a_ops->bmap")) { + struct inode *inode = (struct inode *)opt; + if (inode->i_mapping->a_ops->bmap) + sprintf(func, FUNCTION_PRINT_FORMAT, inode->i_mapping->a_ops->bmap); + return ret; + } + if ((check_func_name("ioctl_fibmap")) && strstr(pointer, "mapping->a_ops->bmap")) { + struct file *filp = (struct file *)opt; + struct address_space *mapping = filp->f_mapping; + if (mapping->a_ops->bmap) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->bmap); + return ret; + } + if ((check_func_name("ioctl_fibmap")) && strstr(pointer, "mapping->a_ops->bmap")) { + struct file *filp = (struct file *)opt; + struct address_space *mapping = filp->f_mapping; + if (mapping->a_ops->bmap) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->bmap); + return ret; + } + if ((check_func_name("page_seek_hole_data")) && strstr(pointer, "ops->is_partially_uptodate")) { + struct inode *inode = (struct inode *)opt; + if (inode->i_mapping->a_ops->is_partially_uptodate) + sprintf(func, FUNCTION_PRINT_FORMAT, inode->i_mapping->a_ops->is_partially_uptodate); + return ret; + } + if ((check_func_name("do_writepages")) && strstr(pointer, "mapping->a_ops->writepages")) { + struct address_space *mapping = (struct address_space *)opt; + if (mapping->a_ops->writepages) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->writepages); + return ret; + } + if ((check_func_name("pageout")) && strstr(pointer, "mapping->a_ops->writepage")) { + struct address_space *mapping = (struct address_space *)opt2; + if (mapping->a_ops->writepage) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->writepage); + return ret; + } + if ((check_func_name("generic_writepages")) && strstr(pointer, "mapping->a_ops->writepage")) { + struct address_space *mapping = (struct address_space *)opt; + if (mapping->a_ops->writepage) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->writepage); + return ret; + } +#ifdef TK5 + if ((check_func_name("read_pages")) && strstr(pointer, "aops->read_folio")) { + struct readahead_control *rac = (struct readahead_control *)opt; + struct address_space_operations *a_ops = rac->mapping->a_ops; + if (a_ops && a_ops->read_folio) + sprintf(func, FUNCTION_PRINT_FORMAT, a_ops->read_folio); + return ret; + } + if ((check_func_name("read_pages")) && strstr(pointer, "aops->readahead")) { + struct readahead_control *rac = (struct readahead_control *)opt; + struct address_space_operations *a_ops = rac->mapping->a_ops; + if (a_ops && a_ops->readahead) + sprintf(func, FUNCTION_PRINT_FORMAT, a_ops->readahead); + return ret; + } +#else + if ((check_func_name("generic_file_buffered_read")) && strstr(pointer, "mapping->a_ops->readpage")) { + struct kiocb *iocb = (struct kiocb *)opt; + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + if (mapping->a_ops->readpage) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->readpage); + return ret; + } + if ((check_func_name("generic_file_buffered_read")) && strstr(pointer, "mapping->a_ops->is_partially_uptodate")) { + struct kiocb *iocb = (struct kiocb *)opt; + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + if (mapping->a_ops->is_partially_uptodate) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->is_partially_uptodate); + return ret; + } + if 
((check_func_name("generic_file_buffered_read")) && strstr(pointer, "mapping->a_ops->readpages")) { + struct address_space *mapping = (struct address_space *)opt; + if (mapping->a_ops->readpages) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->readpages); + return ret; + } +#endif + if ((check_func_name("truncate_error_page")) && strstr(pointer, "mapping->a_ops->error_remove_page")) { + struct address_space * mapping = (struct address_space *)opt3; + if (mapping->a_ops->error_remove_page) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->error_remove_page); + return ret; + } + + /***************** scheduler *****************/ + // check `pick_next_task` and `__pick_next_task` + if (check_func_name("pick_next_task") && strstr(pointer, "fair_sched_class.pick_next_task")) { + sprintf(func, "%s", "pick_next_task_fair"); + return ret; + } + if (check_func_name("select_task_rq") && strstr(pointer, "p->sched_class->select_task_rq")) { + struct task_struct *p = (struct task_struct *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, p->sched_class->select_task_rq); + return ret; + } + if (check_func_name("enqueue_task") && strstr(pointer, "p->sched_class->enqueue_task")) { + struct task_struct *p = (struct task_struct *)opt2; + sprintf(func, FUNCTION_PRINT_FORMAT, p->sched_class->enqueue_task); + return ret; + } + if (check_func_name("dequeue_task") && strstr(pointer, "p->sched_class->dequeue_task")) { + struct task_struct *p = (struct task_struct *)opt2; + sprintf(func, FUNCTION_PRINT_FORMAT, p->sched_class->dequeue_task); + return ret; + } + if (check_func_name("do_sched_yield") && strstr(pointer, "current->sched_class->yield_task")) { + sprintf(func, FUNCTION_PRINT_FORMAT, current->sched_class->yield_task); + return ret; + } + if (check_func_name("yield_to") && strstr(pointer, "curr->sched_class->yield_to_task")) { + sprintf(func, FUNCTION_PRINT_FORMAT, current->sched_class->yield_to_task); + return ret; + } + if (check_func_name("put_prev_task") && strstr(pointer, "prev->sched_class->put_prev_task")) { + struct task_struct *prev = (struct task_struct *)opt2; + sprintf(func, FUNCTION_PRINT_FORMAT, prev->sched_class->put_prev_task); + return ret; + } + if (check_func_name("do_set_cpus_allowed") && strstr(pointer, "p->sched_class->set_cpus_allowed")) { + struct task_struct *p = (struct task_struct *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, p->sched_class->set_cpus_allowed); + return ret; + } + if (check_func_name("task_sched_runtime") && strstr(pointer, "p->sched_class->update_curr")) { + struct task_struct *p = (struct task_struct *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, p->sched_class->update_curr); + return ret; + } + + /***************** network *****************/ + if (check_func_name("__sock_release") && strstr(pointer, "sock->ops->release")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->release); + return ret; + } + if ((check_func_name("kernel_bind") || check_func_name("bind_mcastif_addr")) && strstr(pointer, "sock->ops->bind")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->bind); + return ret; + } + if (check_func_name("kernel_bind") && strstr(pointer, "sock->ops->bind")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->bind); + return ret; + } + if (check_func_name("make_send_sock") && strstr(pointer, "sock->ops->connect")) { + struct socket *sock = (struct socket *)opt; + 
if (sock->ops) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->bind); + return ret; + } + if (check_func_name("sock_poll") && strstr(pointer, "sock->ops->poll")) { + struct file *file = (struct file *)opt; + struct socket *sock = file->private_data; + if (sock->ops) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->poll); + return ret; + } +#if 0 + if (check_func_name("rfkill_poll") && strstr(pointer, "rfkill->ops->poll")) { + struct work_struct *work = (struct work_struct *)opt; + struct rfkill *rfkill = container_of(work, struct rfkill, poll_work.work); + if (rfkill && rfkill->ops) + sprintf(func, FUNCTION_PRINT_FORMAT, rfkill->ops->poll); + return ret; + } +#endif + if (check_func_name("sock_do_ioctl") && strstr(pointer, "sock->ops->ioctl")) { + struct socket *sock = (struct socket *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->ioctl); + return ret; + } +#ifndef TK3 + if (check_func_name("sock_ioctl") && strstr(pointer, "sock->ops->gettstamp")) { + struct file *file = (struct file *)opt; + struct socket *sock = file->private_data; + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->gettstamp); + return ret; + } + if (check_func_name("compat_sock_ioctl_trans") && strstr(pointer, "sock->ops->gettstamp")) { + struct socket *sock = (struct socket *)opt2; + if (sock->ops->gettstamp) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->gettstamp); + return ret; + } +#endif + if (check_func_name("kernel_listen") && strstr(pointer, "sock->ops->listen")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->listen) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->listen); + return ret; + } + if ((check_func_name("sock_sendmsg_nosec") || check_func_name("sock_sendmsg"))&& strstr(pointer, "INDIRECT_CALL_INET")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->sendmsg) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->sendmsg); + return ret; + } + if ((check_func_name("sock_recvmsg_nosec") || check_func_name("sock_recvmsg")) && strstr(pointer, "INDIRECT_CALL_INET")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->recvmsg) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->recvmsg); + return ret; + } + if (check_func_name("sock_mmap") && strstr(pointer, "sock->ops->mmap")) { + struct file *file = (struct file *)opt; + struct socket *sock = file->private_data; + if (sock->ops->recvmsg) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->recvmsg); + return ret; + } + if (check_func_name("sock_splice_read") && strstr(pointer, "sock->ops->splice_read")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->splice_read) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->splice_read); + return ret; + } + if (check_func_name("kernel_sendmsg_locked") && strstr(pointer, "sock->ops->sendmsg_locked")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->sendmsg_locked) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->sendmsg_locked); + return ret; + } +#ifndef TK3 + if (check_func_name("sock_setsockopt") && strstr(pointer, "sock->ops->set_rcvlowat")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->set_rcvlowat) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->set_rcvlowat); + return ret; + } + if (check_func_name("sock_setsockopt") && strstr(pointer, "sock->ops->set_rcvlowat")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->set_rcvlowat) + sprintf(func, FUNCTION_PRINT_FORMAT, sock->ops->set_rcvlowat); + return ret; + } + if (check_func_name("__inet_stream_connect") && strstr(pointer, 
"sk->sk_prot->pre_connect")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->pre_connect) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->pre_connect); + return ret; + } +#endif + if ((check_func_name("inet_sendmsg") || check_func_name("inet6_sendmsg")) && strstr(pointer, "INDIRECT_CALL_2")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->sendmsg) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->sendmsg); + return ret; + } + // check `inet_stream_connect` and `__inet_stream_connect` + if ((check_func_name("pn_socket_connect") || check_func_name("inet_stream_connect") + || check_func_name("ieee802154_sock_connect")) + && strstr(pointer, "sk->sk_prot->connect")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->connect) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->connect); + return ret; + } + // check `inet_stream_connect` and `__inet_stream_connect` + if ((check_func_name("inet_stream_connect") || check_func_name("inet_shutdown") + || check_func_name("ieee802154_sock_connect")) + && strstr(pointer, "sk->sk_prot->disconnect")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->disconnect) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->disconnect); + return ret; + } + if (check_func_name("pn_socket_ioctl") && strstr(pointer, "sk->sk_prot->accept")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->accept) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->accept); + return ret; + } + if ((check_func_name("pn_socket_ioctl") || check_func_name("inet6_ioctl") + || check_func_name("inet_ioctl") || check_func_name("ieee802151_sock_ioctl")) + && strstr(pointer, "sk->sk_prot->ioctl")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->ioctl) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->ioctl); + return ret; + } + if (check_func_name("compat_sock_common_setsockopt") && strstr(pointer, "sk->sk_prot->setsockopt")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->setsockopt) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->setsockopt); + return ret; + } + if (check_func_name("compat_sock_common_getsockopt") && strstr(pointer, "sk->sk_prot->getsockopt")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->getsockopt) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->getsockopt); + return ret; + } + if (check_func_name("sock_setsockopt") && strstr(pointer, "sk->sk_prot->keepalive")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->keepalive) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->keepalive); + return ret; + } + if (check_func_name("sock_common_recvmsg") && strstr(pointer, "sk->sk_prot->recvmsg")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->recvmsg) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->recvmsg); + return ret; + } + // check `sk_backlog_rcv` and `__sk_backlog_rcv` + if (check_func_name("sk_backlog_rcv") && strstr(pointer, "sk->sk_backlog_rcv")) { + struct sock *sk = (struct sock *)opt; + if (sk->sk_backlog_rcv) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_backlog_rcv); + return ret; + } + if (check_func_name("release_sock") && strstr(pointer, "sk->sk_prot->release_cb")) { + 
struct sock *sk = (struct sock *)opt; + if (sk->sk_prot->release_cb) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->release_cb); + return ret; + } + // check `inet6_bind` and `__inet6_bind` + // check `inet_bind` and `__inet_bind` + if ((check_func_name("pn_socket_bind") || check_func_name("inet6_bind") + || check_func_name("inet_csk_listen_start") || check_func_name("inet_bind") + || check_func_name("inet_autobind")) + && strstr(pointer, "sk->sk_prot->get_port")) { + struct sock *sk = (struct sock *)opt; + if (sk->sk_prot->get_port) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->release_cb); + return ret; + } +#ifdef TK4_NEW_1 + // check `inet6_bind` and `__inet6_bind` + if (check_func_name("inet6_bind") && strstr(pointer, "sk->sk_prot->put_port")) { + struct sock *sk = (struct sock *)opt; + if (sk->sk_prot->put_port) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->put_port); + return ret; + } + // check `inet_bind` and `__inet_bind` + if ((check_func_name("inet_autobind") || check_func_name("inet_bind")) && strstr(pointer, "sk->sk_prot->put_port")) { + struct sock *sk = (struct sock *)opt; + if (sk->sk_prot->put_port) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->put_port); + return ret; + } +#endif + if ((check_func_name("sk_enter_memory_pressure") || check_func_name("sk_stream_alloc_skb") + || check_func_name("tls_do_allocation")) && strstr(pointer, "sk->sk_prot->enter_memory_pressure")) { + struct sock *sk = (struct sock *)opt; + if (sk->sk_prot->enter_memory_pressure) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->enter_memory_pressure); + return ret; + } + if (check_func_name("sk_leave_memory_pressure") && strstr(pointer, "sk->sk_prot->leave_memory_pressure")) { + struct sock *sk = (struct sock *)opt; + if (sk->sk_prot->leave_memory_pressure) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->leave_memory_pressure); + return ret; + } + // check `sk_stream_memory_free` and `__sk_stream_memory_free` + if (check_func_name("sk_stream_memory_free") && strstr(pointer, "sk->sk_prot->stream_memory_free")) { + struct sock *sk = (struct sock *)opt; + if (sk->sk_prot->stream_memory_free) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->stream_memory_free); + return ret; + } + if ((check_func_name("inet_recvmsg") || check_func_name("inet_sendmsg") + || check_func_name("inet6_recvmsg") || check_func_name("inet6_sendmsg")) + && strstr(pointer, "INDIRECT_CALL_2")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->recvmsg) + sprintf(func, FUNCTION_PRINT_FORMAT, sk->sk_prot->recvmsg); + return ret; + } +#ifndef TK5 + if (check_func_name("generic_make_request")) { + struct bio *bio = (struct bio *)opt; + struct request_queue *q; + if (bio && bio->bi_disk) + q= bio->bi_disk->queue; + if (q && q->make_request_fn) + sprintf(func, FUNCTION_PRINT_FORMAT, q->make_request_fn); + return ret; + } +#endif +#endif + return false; +} diff --git a/ops/os_stat/os_stat/func_pointer_table_5_4.c b/ops/os_stat/os_stat/func_pointer_table_5_4.c new file mode 100644 index 0000000000000000000000000000000000000000..0e017d2314d842458d76496f7936074414741acb --- /dev/null +++ b/ops/os_stat/os_stat/func_pointer_table_5_4.c @@ -0,0 +1,146 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#if defined(TK4) || defined(TK4_NEW) +#include +#else +#include +#endif +#include +#include +#include "kprobe_prehook.h" +#include "syms.h" +#include "./include/kernel/sched/sched.h" +#include 
"./include/drivers/target/target_core_file.h" +#ifndef TK2 +#include "./include/drivers/block/loop.h" +#endif + +#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) +extern unsigned int sysctl_module_debug; +struct shm_file_data { + int id; + struct ipc_namespace *ns; + struct file *file; + const struct vm_operations_struct *vm_ops; +}; +bool sub_print_func_name(void *opt, void *opt2, void *opt3, char *pointer, char *func) +{ + bool ret = true; + + +#ifndef TK2 + if ((check_func_name("nobh_truncate_page") || check_func_name("do_read_cache_page") || check_func_name("read_pages")) + && strstr(pointer, "mapping->a_ops->readpage")) { + struct address_space *mapping = (struct address_space *)opt; + sprintf(func, "%pF", mapping->a_ops->readpage); + return ret; + } + if ((check_func_name("generic_file_buffered_read")) && strstr(pointer, "mapping->a_ops->readpage")) { + struct kiocb *iocb = (struct kiocb *)opt; + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + sprintf(func, "%pF", mapping->a_ops->readpage); + return ret; + } + if ((check_func_name("filemap_fault")) && strstr(pointer, "mapping->a_ops->readpage")) { + struct vm_fault *vmf = (struct vm_fault *)opt; + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + sprintf(func, "%pF", mapping->a_ops->readpage); + return ret; + } + if ((check_func_name("filemap_fault")) && strstr(pointer, "mapping->a_ops->readpage")) { + struct vm_fault *vmf = (struct vm_fault *)opt; + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + sprintf(func, "%pF", mapping->a_ops->readpage); + return ret; + } + if (check_func_name("read_pages") && strstr(pointer, "mapping->a_ops->readpages")) { + struct address_space *mapping = (struct address_space *)opt; + if (mapping->a_ops->readpages) + sprintf(func, "%pF", mapping->a_ops->readpages); + return ret; + } + if ((check_func_name("writeout")) && strstr(pointer, "mapping->a_ops->writepage")) { + struct address_space *mapping = (struct address_space *)opt; + if (mapping && mapping->a_ops->writepage) + sprintf(func, "%pF", mapping->a_ops->writepage); + return ret; + } + if ((check_func_name("__writepage")) && strstr(pointer, "mapping->a_ops->writepage")) { + struct address_space *mapping = (struct address_space *)opt3; + if (mapping && mapping->a_ops->writepage) + sprintf(func, "%pF", mapping->a_ops->writepage); + return ret; + } + if ((check_func_name("pageout")) && strstr(pointer, "mapping->a_ops->writepage")) { + struct address_space *mapping = (struct address_space *)opt2; + if (mapping && mapping->a_ops->writepage) + sprintf(func, "%pF", mapping->a_ops->writepage); + return ret; + } + if ((check_func_name("set_page_dirty")) && strstr(pointer, "spd")) { + struct page *page = (struct page *)opt; + struct address_space *mapping = page_mapping(page); + if (mapping && mapping->a_ops->set_page_dirty) + sprintf(func, "%pF", mapping->a_ops->set_page_dirty); + return ret; + } + if ((check_func_name("ext4_writepage")) && strstr(pointer, "inode->i_mapping->a_ops->invalidatepage")) { + struct page *page = (struct page *)opt; + struct inode *inode = page->mapping->host; + sprintf(func, "%pF", inode->i_mapping->a_ops->invalidatepage); + return ret; + } + if ((check_func_name("do_invalidatepage")) && strstr(pointer, "invalidatepage")) { + struct page *page = (struct page *)opt; + if (page->mapping->a_ops->invalidatepage) + sprintf(func, "%pF", page->mapping->a_ops->invalidatepage); + return 
ret; + } + if ((check_func_name("try_to_release_page")) && strstr(pointer, "mapping->a_ops->releasepage")) { + struct page *page = (struct page *)opt; + struct address_space * const mapping = page->mapping; + if (mapping && mapping->a_ops->releasepage) + sprintf(func, "%pF", mapping->a_ops->releasepage); + return ret; + } + if ((check_func_name("move_to_new_page")) && strstr(pointer, "mapping->a_ops->migratepage")) { + struct page *page = (struct page *)opt2; + struct address_space * const mapping = page->mapping; + if (mapping && mapping->a_ops->migratepage) + sprintf(func, "%pF", mapping->a_ops->migratepage); + return ret; + } + if (check_func_name("check_preempt_curr") && strstr(pointer, "rq->curr->sched_class->check_preempt_curr")) { + struct rq *rq = (struct rq *)opt; + sprintf(func, "%pF", rq->curr->sched_class->check_preempt_curr); + return ret; + } + if (check_func_name("kernel_sendpage") && strstr(pointer, "sock->ops->sendpage")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->sendpage) + sprintf(func, "%pF", sock->ops->sendpage); + return ret; + } + if (check_func_name("kernel_sendpage_locked") && strstr(pointer, "sock->ops->sendpage_locked")) { + struct socket *sock = (struct socket *)opt; + if (sock->ops->sendpage_locked) + sprintf(func, "%pF", sock->ops->sendpage_locked); + return ret; + } + if (check_func_name("inet_sendpage") && strstr(pointer, "sk->sk_prot->sendpage")) { + struct socket *sock = (struct socket *)opt; + struct sock *sk = sock->sk; + if (sk->sk_prot->sendpage) + sprintf(func, "%pF", sk->sk_prot->sendpage); + return ret; + } +#endif + return false; +} diff --git a/ops/os_stat/os_stat/func_pointer_table_6_6.c b/ops/os_stat/os_stat/func_pointer_table_6_6.c new file mode 100644 index 0000000000000000000000000000000000000000..1f97ce3cccd96038afe7a24a003011b28c3d81ac --- /dev/null +++ b/ops/os_stat/os_stat/func_pointer_table_6_6.c @@ -0,0 +1,114 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#if defined(TK4) || defined(TK4_NEW) +#include +#else +#include +#endif +#include +#include +#include "kprobe_prehook.h" +#include "syms.h" +#include "./include/kernel/sched/sched.h" +#include "./include/drivers/target/target_core_file.h" +#include "./include/drivers/block/loop.h" + +#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) +extern unsigned int sysctl_module_debug; +struct shm_file_data { + int id; + struct ipc_namespace *ns; + struct file *file; + const struct vm_operations_struct *vm_ops; +}; +struct address_space *folio_mapping_ftrace(struct folio *folio) +{ + struct address_space *mapping; + + /* This happens if someone calls flush_dcache_page on slab page */ + if (unlikely(folio_test_slab(folio))) + return NULL; + + if (unlikely(folio_test_swapcache(folio))) + return NULL; + + mapping = folio->mapping; + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) + return NULL; + + return mapping; +} + +bool sub_print_func_name(void *opt, void *opt2, void *opt3, char *pointer, char *func) +{ + bool ret = true; + + + if ((check_func_name("nobh_truncate_page") || check_func_name("do_read_cache_page") || check_func_name("read_pages")) + && strstr(pointer, "mapping->a_ops->read_folio")) { + struct address_space *mapping = (struct address_space *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->read_folio); + return ret; + } + if ((check_func_name("generic_file_buffered_read")) && strstr(pointer, "mapping->a_ops->read_folio")) { + struct kiocb *iocb = (struct 
kiocb *)opt; + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->read_folio); + return ret; + } + if ((check_func_name("filemap_fault")) && strstr(pointer, "mapping->a_ops->read_folio")) { + struct vm_fault *vmf = (struct vm_fault *)opt; + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->read_folio); + return ret; + } + if ((check_func_name("filemap_fault")) && strstr(pointer, "mapping->a_ops->read_folio")) { + struct vm_fault *vmf = (struct vm_fault *)opt; + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->read_folio); + return ret; + } + if ((check_func_name("read_pages")) && strstr(pointer, "ops->readahead")) { + struct readahead_control *rac = (struct readahead_control *)opt; + const struct address_space_operations *aops = rac->mapping->a_ops; + if (rac && aops->readahead) + sprintf(func, FUNCTION_PRINT_FORMAT, aops->writepage); + return ret; + } + if ((check_func_name("folio_invalidate")) && strstr(pointer, "a_ops->invalidate_folio")) { + struct folio *folio = (struct folio *)opt; + const struct address_space_operations *aops = folio->mapping->a_ops; + if (aops->invalidate_folio) + sprintf(func, FUNCTION_PRINT_FORMAT, aops->invalidate_folio); + return ret; + } + if ((check_func_name("filemap_release_folio")) && strstr(pointer, "mapping->a_ops->release_folio")) { + struct folio *folio = (struct folio *)opt; + struct address_space * const mapping = folio->mapping; + if (mapping && mapping->a_ops->release_folio) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->release_folio); + return ret; + } + if ((check_func_name("move_to_new_folio")) && strstr(pointer, "mapping->a_ops->migrate_folio")) { + struct folio *src = (struct folio *)opt2; + struct address_space *mapping = folio_mapping_ftrace(src); + if (mapping && mapping->a_ops->migrate_folio) + sprintf(func, FUNCTION_PRINT_FORMAT, mapping->a_ops->migrate_folio); + return ret; + } + if (strstr(pointer, "rq->curr->sched_class->wakeup_preempt")) { + struct rq *rq = (struct rq *)opt; + sprintf(func, FUNCTION_PRINT_FORMAT, rq->curr->sched_class->wakeup_preempt); + return ret; + } + + return false; +} diff --git a/ops/os_stat/os_stat/func_struct_table.c b/ops/os_stat/os_stat/func_struct_table.c new file mode 100644 index 0000000000000000000000000000000000000000..b8b8d28099c5e30cf215d3737228dcab2f61885e --- /dev/null +++ b/ops/os_stat/os_stat/func_struct_table.c @@ -0,0 +1,58 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#if defined(TK4) || defined(TK4_NEW) +#include +#else +#include +#endif +#include #include +#include "kprobe_prehook.h" +#include "syms.h" +#include "./include/kernel/sched/sched.h" +#ifndef TK2 +#include "./include/drivers/target/target_core_file.h" +#include "./include/drivers/block/loop.h" +#endif + +#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) +extern unsigned int sysctl_module_debug; +extern unsigned int sysctl_module_print; +extern char printk_name_first[NAME_MAX]; +extern char printk_name_last[NAME_MAX]; +void print_info(void *opt, void *opt2, void *opt3) +{ + bool ret; + +#ifndef TK2 + if (sysctl_module_debug != FTRACE_CONTROL_POINTER || sysctl_module_print == 0) + return; + + ret = print_func_name(opt, opt2, opt3, printk_name_first, 
printk_name_last); + if (ret) + goto out; + + /***************** fs *****************/ + if (strstr(printk_name_first, "file->f_path.dentry->d_iname")) { + struct file *file = (struct file *)opt; + sprintf(printk_name_last, "%s\0", file->f_path.dentry->d_iname); + ret = true; + } +#ifndef TK3 + if (strstr(printk_name_first, "info->si_signo")) { + struct kernel_siginfo *info = (struct kernel_siginfo *)opt2; + /* demo:print signo */ + if (info > SEND_SIG_PRIV) + sprintf(printk_name_last, "%d\0", info->si_signo); + ret = true; + } +#endif + +out: + if (ret && sysctl_module_print > 1) + pr_info("%s, proc:%s, %d", printk_name_last, current->comm, current->pid); +#endif +} diff --git a/ops/os_stat/os_stat/hook.c b/ops/os_stat/os_stat/hook.c new file mode 100644 index 0000000000000000000000000000000000000000..8eaee093ca3a3c8125723a14a8141b8bee26b8e8 --- /dev/null +++ b/ops/os_stat/os_stat/hook.c @@ -0,0 +1,371 @@ +/* + * Kernel dynamic hooks based on ftrace + * aurelianliu@tencent.com + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#ifndef TK5 +#include +#endif +#include +#include +#include +#include +#include +#include +#include "version.h" +#include "hook.h" +#include "data_aware.h" + +#ifdef CONFIG_FUNCTION_TRACER +#define CC_USING_FENTRY +#endif + +/* + * Patched functions + */ +#ifdef CC_USING_FENTRY +static struct verify_func verify_funcs_system[] = +{ +#ifdef TK5 + {NULL, NULL, "__alloc_pages", 0, 0, (unsigned long)stat_alloc_pages_nodemask, 0}, + {NULL, NULL, "handle_mm_fault", 0, 0, (unsigned long)stat_handle_mm_fault, 0}, + {NULL, NULL, "blk_mq_dispatch_rq_list", 0, 0, (unsigned long)stat_blk_mq_dispatch_rq_list, 0}, +#else + {NULL, NULL, "__alloc_pages_nodemask", 0, 0, (unsigned long)stat_alloc_pages_nodemask, 0}, + {NULL, NULL, "blk_mq_get_driver_tag", 0, 0, (unsigned long)stat_blk_mq_get_driver_tag, 0}, +#ifndef TK3 + {NULL, NULL, "do_page_fault", 0, 0, (unsigned long)stat_do_page_fault, 0}, +#endif +#endif +#if defined(TK4_OLD) || defined(TK3) || defined(TK2) ||CONFIG_ARM64 + {NULL, NULL, "pick_next_task_fair", 0, 0, (unsigned long)stat_pick_next_task, 0}, +#else + {NULL, NULL, "pick_next_task", 0, 0, (unsigned long)stat_pick_next_task, 0}, +#endif +#ifdef TK2 + {NULL, NULL, "__sched_fork", 0, 0, (unsigned long)stat__sched_fork, 0}, + {NULL, NULL, "do_exit", 0, 0, (unsigned long)stat_do_exit, 0}, +#endif + {NULL, NULL, "submit_bio", 0, 0, (unsigned long)stat_submit_bio, 0}, + {NULL, NULL, "bio_endio", 0, 0, (unsigned long)stat_bio_endio, 0}, + {NULL, NULL, "__kmalloc", 0, 0, (unsigned long)stat__kmalloc, 0}, + {NULL, NULL, "__kmalloc_node", 0, 0, (unsigned long)stat__kmalloc_node, 0}, + {NULL, NULL, "kmem_cache_alloc", 0, 0, (unsigned long)stat_kmem_cache_alloc, 0}, +}; +static struct verify_func verify_funcs_init[] = +{ +#ifdef TK5 + {NULL, NULL, "psi_task_switch", 0, 0, (unsigned long)stat_psi_task_switch, 0}, +#else + {NULL, NULL, "finish_task_switch", 0, 0, (unsigned long)stat_finish_task_switch, 0}, +#ifdef TK4_NEW + {NULL, NULL, "sched_rqm_switch", 0, 0, (unsigned long)stat_sched_rqm_switch, 0}, +#else + {NULL, NULL, "rcu_note_context_switch", 0, 0, (unsigned long)stat_rcu_note_context_switch, 0}, +#endif +#endif +#ifdef CONFIG_X86_64 +#ifdef TK5 + {NULL, NULL, "handle_level_irq", 0, 0, (unsigned long)stat_handle_level_irq, 0}, + {NULL, NULL, "handle_fasteoi_irq", 0, 0, (unsigned long)stat_handle_fasteoi_irq, 0}, + {NULL, NULL, "handle_edge_irq", 0, 0, (unsigned long)stat_handle_edge_irq, 0}, + {NULL, NULL, "handle_simple_irq", 0, 0, (unsigned 
long)stat_handle_simple_irq, 0}, +#else + {NULL, NULL, "do_IRQ", 0, 0, (unsigned long)stat_do_IRQ, 0}, +#endif +#endif +#ifdef CONFIG_ARM64 + {NULL, NULL, "gic_handle_irq", 0, 0, (unsigned long)stat_gic_handle_irq, 0}, +#endif + {NULL, NULL, "do_kern_addr_fault", 0, 0, (unsigned long)stat_do_kern_addr_fault, 0}, + {NULL, NULL, "no_context", 0, 0, (unsigned long)stat_no_context, 0}, + {NULL, NULL, "do_general_protection", 0, 0, (unsigned long)stat_do_general_protection, 0}, + {NULL, NULL, "do_divide_error", 0, 0, (unsigned long)stat_do_divide_error, 0}, +}; + //{NULL, NULL, "do_syscall_64", 0, 0, (unsigned long)stat_do_syscall_64, 0}, + +#else + +#error "error, compiler donot support fentry?"; + +#endif + +static bool system_hooked; +int system_hook_name; +struct percpu_counter patch_num; +struct percpu_counter patch_num_system; + +static struct module *get_link_module(char *modname) +{ + struct module *mod; + +#ifdef TK5 + mutex_lock(module_mutex_tk5); + mod = find_module_tk5(modname); +#else + mutex_lock(&module_mutex); + mod = find_module(modname); +#endif + if (!mod) { +#ifdef TK5 + mutex_unlock(module_mutex_tk5); +#else + mutex_unlock(&module_mutex); +#endif + return NULL; + } + + WARN_ON(!try_module_get(mod)); +#ifdef TK5 + mutex_unlock(module_mutex_tk5); +#else + mutex_unlock(&module_mutex); +#endif + + return mod; +} + +static inline void put_link_modules(struct module *mod) +{ + if (mod) + module_put(mod); +} + +static int verify_kernel(struct verify_func verify_funcs[], int count) +{ + int i; + char name[256]; + + for (i = 0; i < count; i++) { + if (verify_funcs[i].modname) { + verify_funcs[i].mod = get_link_module(verify_funcs[i].modname); + if (!verify_funcs[i].mod){ + pr_err("unable to find module '%s'\n", verify_funcs[i].modname); + return -ENXIO; + } + snprintf(name, sizeof(name), "%s:%s",verify_funcs[i].modname, verify_funcs[i].name); + } else { + strcpy(name, verify_funcs[i].name); + } + + /* check symbol */ +#ifdef TK5 + verify_funcs[i].old_addr = kallsyms_lookup_name_tk5(name); +#else + verify_funcs[i].old_addr = kallsyms_lookup_name(name); +#endif + if (!verify_funcs[i].old_addr) { + pr_err("unable to find symbol '%s'\n", name); + return -ENXIO; + } + pr_info("find symbol '%s', %lx\n", name, verify_funcs[i].old_addr); + } + return 0; +} + +static struct verify_func *tpatch_find_match_ip(struct verify_func verify_funcs[], unsigned long ip, int count) +{ + int i; + + if (stat_hook_function != 1) + return NULL; + for ( i =0; i < count ; i++) { + if (ip == verify_funcs[i].old_addr + verify_funcs[i].old_offset) { + return &verify_funcs[i]; + } + } + + return NULL; +} + +/* Update regs->ip to tell ftrace to return + * to the new function.*/ +#ifdef TK5 +static void notrace tpatch_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *fops, struct ftrace_regs *regs, struct verify_func verify_funcs[], int count) +#else +static void notrace tpatch_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *fops, struct pt_regs *regs, struct verify_func verify_funcs[], int count) +#endif +{ + struct verify_func *func; + + preempt_disable_notrace(); + + func = tpatch_find_match_ip(verify_funcs, ip, count); + if (func) +#ifdef CONFIG_X86 +#ifdef TK5 + regs->regs.ip = func->new_addr + MCOUNT_INSN_SIZE; +#else + regs->ip = func->new_addr + MCOUNT_INSN_SIZE; +#endif +#elif defined(CONFIG_ARM64) +#ifdef TK5 + regs->regs.pc = func->new_addr + MCOUNT_INSN_SIZE; +#else + regs->pc = func->new_addr + MCOUNT_INSN_SIZE; +#endif +#endif + + 
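+	/* Descriptive note (added comment, not in the original patch): if a matching hook was found above, regs->ip (x86) / regs->pc (arm64) now points at the replacement handler; MCOUNT_INSN_SIZE is added so the replacement function's own fentry stub is skipped and ftrace returns directly into its body. */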
preempt_enable_notrace(); +} + +#ifdef TK5 +static void notrace tpatch_ftrace_handler_init(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *fops, struct ftrace_regs *regs) +#else +static void notrace tpatch_ftrace_handler_init(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *fops, struct pt_regs *regs) +#endif +{ + int count; + count = ARRAY_SIZE(verify_funcs_init); + tpatch_ftrace_handler(ip, parent_ip, fops, regs, verify_funcs_init, count); +} +#ifdef TK5 +static void notrace tpatch_ftrace_handler_system(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *fops, struct ftrace_regs *regs) +#else +static void notrace tpatch_ftrace_handler_system(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *fops, struct pt_regs *regs) +#endif +{ + int count; + count = ARRAY_SIZE(verify_funcs_system); + tpatch_ftrace_handler(ip, parent_ip, fops, regs, verify_funcs_system, count); +} + +static struct ftrace_ops tpatch_ftrace_ops_init __read_mostly = { + .func = tpatch_ftrace_handler_init, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; +static struct ftrace_ops tpatch_ftrace_ops_system __read_mostly = { + .func = tpatch_ftrace_handler_system, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; + +static int patch_kernel(struct ftrace_ops *tpatch_ftrace_ops, struct verify_func verify_funcs[], int count) +{ + int ret, i = 0; + + for ( i =0; i < count ; i++) { + ret = ftrace_set_filter(tpatch_ftrace_ops, verify_funcs[i].name, strlen(verify_funcs[i].name), 0); + if (ret < 0) { + pr_err("can't set ftrace filter func:%s at address 0x%lx, ret(%d)\n", verify_funcs[i].name, + verify_funcs[i].old_addr, ret); + goto error; + } + + /* put module */ + put_link_modules(verify_funcs[i].mod); + } + + ret = register_ftrace_function(tpatch_ftrace_ops); + if (ret < 0) { + pr_err("can't register ftrace handler\n"); + goto error; + } + + return 0; + + error: + for (; i < count; i++) + put_link_modules(verify_funcs[i].mod); + + return ret; +} + +int system_base_function_hook(void) +{ + int ret, count; + + if (system_hooked) { + pr_info("hooked already."); + return 0; + } + /* Verify patched functions */ + count = ARRAY_SIZE(verify_funcs_system); + ret = verify_kernel(verify_funcs_system, count); + if(ret < 0) { + pr_err("Incorrect kernel, or function not found\n"); + return -ENODEV; + } +#if defined(TK2) && !defined(KVM3) + percpu_counter_init(&patch_num_system, 0); +#else + percpu_counter_init(&patch_num_system, 0, GFP_KERNEL); +#endif + + /* Ok, try to replace target functions */ + ret = patch_kernel(&tpatch_ftrace_ops_system, verify_funcs_system, count); + if (ret < 0) + percpu_counter_destroy(&patch_num_system); + else + system_hooked = true; + + return ret; +} +int system_base_function_unhook(void) +{ + if (!system_hooked) + return 0; + + system_hooked = false; + /* Destroy ftrace filter */ + unregister_ftrace_function(&tpatch_ftrace_ops_system); + synchronize_rcu(); + + /* Wait all exit patched function */ + while (percpu_counter_sum(&patch_num_system)) + msleep(1); + + percpu_counter_destroy(&patch_num_system); + + return 0; +} + +int __init patch_init(void) +{ + int ret, count; + + /* Verify patched functions */ + count = ARRAY_SIZE(verify_funcs_init); + ret = verify_kernel(verify_funcs_init, count); + if(ret < 0) { + pr_err("Incorrect kernel, or function not found\n"); + return -ENODEV; + } + +#if defined(TK2) && !defined(KVM3) + percpu_counter_init(&patch_num, 0); +#else + percpu_counter_init(&patch_num, 0, GFP_KERNEL); +#endif + /* Ok, try to replace target functions */ + ret = 
patch_kernel(&tpatch_ftrace_ops_init, verify_funcs_init, count); + if (ret < 0) { + percpu_counter_destroy(&patch_num); + return ret; + } + + return 0; +} + +void patch_exit(void) +{ + if (system_hooked) + system_base_function_unhook(); + system_hooked = false; + /* Destroy ftrace filter */ + unregister_ftrace_function(&tpatch_ftrace_ops_init); + synchronize_rcu(); + + /* Wait all exit patched function */ + while (percpu_counter_sum(&patch_num)) + msleep(1); + + percpu_counter_destroy(&patch_num); +} diff --git a/ops/os_stat/os_stat/hook.h b/ops/os_stat/os_stat/hook.h new file mode 100644 index 0000000000000000000000000000000000000000..f23a8a76951c9cb8d044862f866b837d6b100888 --- /dev/null +++ b/ops/os_stat/os_stat/hook.h @@ -0,0 +1,69 @@ +/* + * Kpatch module + * + * The core code comes from tpatch, the early hot patch tool of tlinux. + * + */ +#ifndef _TPATCH_H +#define _TPATCH_H + +#include +#include +#include "hook_tk5.h" + +struct verify_func +{ + char *modname; + struct module *mod; + char *name; + unsigned long old_addr; + unsigned long old_offset; + unsigned long new_addr; + unsigned long new_offset; +}; + +extern int system_hook_name; +extern struct percpu_counter patch_num; +extern struct percpu_counter patch_num_system; +extern void context_check_end(void); +extern void context_check_start(void); + +static inline void enter_hook(void) +{ + percpu_counter_inc(&patch_num); + context_check_start(); +} + +static inline void exit_hook(void) +{ + context_check_end(); + percpu_counter_dec(&patch_num); +} +static inline void enter_hook_special(void) +{ + percpu_counter_inc(&patch_num_system); +} + +static inline void exit_hook_special(void) +{ + percpu_counter_dec(&patch_num_system); +} +static inline void enter_hook_system(void) +{ + percpu_counter_inc(&patch_num_system); + context_check_start(); +} + +static inline void exit_hook_system(void) +{ + context_check_end(); + percpu_counter_dec(&patch_num_system); +} + +extern int system_base_function_hook(void); +extern int system_base_function_unhook(void); + +extern int patch_init(void); +extern void patch_exit(void); + +#endif diff --git a/ops/os_stat/os_stat/hook_tk5.c b/ops/os_stat/os_stat/hook_tk5.c new file mode 100644 index 0000000000000000000000000000000000000000..93db639d930dbd640fb10502e60f927eb276a6f4 --- /dev/null +++ b/ops/os_stat/os_stat/hook_tk5.c @@ -0,0 +1,42 @@ +#include +#include +#include +#include "hook_tk5.h" + +#ifdef TK5 +static struct kprobe kp = { + .symbol_name = "kallsyms_lookup_name" +}; + +kallsyms_lookup_name_t kallsyms_lookup_name_tk5; +find_module_t find_module_tk5; +int ftrace_init_tk5(void) +{ + int ret, i; + + ret = register_kprobe(&kp); + kallsyms_lookup_name_tk5 = (kallsyms_lookup_name_t) kp.addr; + unregister_kprobe(&kp); + + kp.symbol_name = "find_moudle"; + register_kprobe(&kp); + find_module_tk5 = (find_module_t)kp.addr; + unregister_kprobe(&kp); + + return 0; +} + +unsigned long find_symbols(char *symbols) +{ + int ret; + unsigned long addr; + + kp.symbol_name = symbols; + ret = register_kprobe(&kp); + addr = kp.addr; + unregister_kprobe(&kp); + return addr; +} + + +#endif diff --git a/ops/os_stat/os_stat/hook_tk5.h b/ops/os_stat/os_stat/hook_tk5.h new file mode 100644 index 0000000000000000000000000000000000000000..1e77523e9e53b8f104619e7d557835491b251a6d --- /dev/null +++ b/ops/os_stat/os_stat/hook_tk5.h @@ -0,0 +1,15 @@ +#ifndef _HOOK_TK5_H +#define _HOOK_TK5_H +#include "version.h" +#ifdef TK5 +typedef unsigned long (*kallsyms_lookup_name_t)(const char *name); +extern kallsyms_lookup_name_t 
kallsyms_lookup_name_tk5; + +typedef struct module * (*find_module_t)(const char *name); +extern find_module_t find_module_tk5; + +extern int ftrace_init_tk5(void); +extern unsigned long find_symbols(char *symbols); + +#endif +#endif diff --git a/ops/os_stat/os_stat/include b/ops/os_stat/os_stat/include new file mode 120000 index 0000000000000000000000000000000000000000..241d3067efc8be16ec9a385923651bcb395017c9 --- /dev/null +++ b/ops/os_stat/os_stat/include @@ -0,0 +1 @@ +include_private \ No newline at end of file diff --git a/ops/os_stat/os_stat/include_6_6/arch/x86/include/asm/syscall.h b/ops/os_stat/os_stat/include_6_6/arch/x86/include/asm/syscall.h new file mode 100644 index 0000000000000000000000000000000000000000..03bb950eba690f332fcdf71f996681770e4355bd --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/arch/x86/include/asm/syscall.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Access to user system call parameters and results + * + * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. + * + * See asm-generic/syscall.h for descriptions of what we must do here. + */ + +#ifndef _ASM_X86_SYSCALL_H +#define _ASM_X86_SYSCALL_H + +#include +#include +#include +#include /* for TS_COMPAT */ +#include + +/* This is used purely for kernel/trace/trace_syscalls.c */ +typedef long (*sys_call_ptr_t)(const struct pt_regs *); +extern const sys_call_ptr_t sys_call_table[]; + +/* + * These may not exist, but still put the prototypes in so we + * can use IS_ENABLED(). + */ +extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x64_sys_call(const struct pt_regs *, unsigned int nr); + +/* + * Only the low 32 bits of orig_ax are meaningful, so we return int. + * This importantly ignores the high bits on 64-bit, so comparisons + * sign-extend the low 32 bits. + */ +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) +{ + return regs->orig_ax; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + regs->ax = regs->orig_ax; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + unsigned long error = regs->ax; +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entries and then + * remains set until we return to user mode. + */ + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long) (int) error; +#endif + return IS_ERR_VALUE(error) ? 
error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->ax; +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->ax = (long) error ?: val; +} + +#ifdef CONFIG_X86_32 + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + memcpy(args, ®s->bx, 6 * sizeof(args[0])); +} + +static inline int syscall_get_arch(struct task_struct *task) +{ + return AUDIT_ARCH_I386; +} + +#else /* CONFIG_X86_64 */ + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + *args++ = regs->bx; + *args++ = regs->cx; + *args++ = regs->dx; + *args++ = regs->si; + *args++ = regs->di; + *args = regs->bp; + } else +# endif + { + *args++ = regs->di; + *args++ = regs->si; + *args++ = regs->dx; + *args++ = regs->r10; + *args++ = regs->r8; + *args = regs->r9; + } +} + +static inline int syscall_get_arch(struct task_struct *task) +{ + /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ + return (IS_ENABLED(CONFIG_IA32_EMULATION) && + task->thread_info.status & TS_COMPAT) + ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; +} + +void do_syscall_64(struct pt_regs *regs, int nr); +void do_int80_emulation(struct pt_regs *regs); + +#endif /* CONFIG_X86_32 */ + +void do_int80_syscall_32(struct pt_regs *regs); +long do_fast_syscall_32(struct pt_regs *regs); +long do_SYSENTER_32(struct pt_regs *regs); + +#endif /* _ASM_X86_SYSCALL_H */ diff --git a/ops/os_stat/os_stat/include_6_6/drivers/block/loop.h b/ops/os_stat/os_stat/include_6_6/drivers/block/loop.h new file mode 100644 index 0000000000000000000000000000000000000000..73c6c173ef119b032dda4e54c619426e30ac7266 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/drivers/block/loop.h @@ -0,0 +1,58 @@ +#include +#include + +struct loop_device { + int lo_number; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + char lo_file_name[LO_NAME_SIZE]; + + struct file * lo_backing_file; + struct block_device *lo_device; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; + struct mutex lo_mutex; + bool idr_visible; +}; + +struct loop_cmd { + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; +}; + +struct bdev_inode { + struct block_device bdev; + struct inode vfs_inode; +}; +static inline struct bdev_inode *BDEV_I(struct inode *inode) +{ + return container_of(inode, struct bdev_inode, vfs_inode); +} + +static struct block_device *I_BDEV_FTRACE(struct inode *inode) +{ + return &BDEV_I(inode)->bdev; +} diff --git a/ops/os_stat/os_stat/include_6_6/drivers/target/target_core_file.h b/ops/os_stat/os_stat/include_6_6/drivers/target/target_core_file.h new file mode 100644 index 0000000000000000000000000000000000000000..929b1ecd544ee0ffb84973b64867a3dabb8a2f45 --- /dev/null 
+++ b/ops/os_stat/os_stat/include_6_6/drivers/target/target_core_file.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef TARGET_CORE_FILE_H +#define TARGET_CORE_FILE_H + +#include + +#define FD_VERSION "4.0" + +#define FD_MAX_DEV_NAME 256 +#define FD_MAX_DEV_PROT_NAME FD_MAX_DEV_NAME + 16 +#define FD_DEVICE_QUEUE_DEPTH 32 +#define FD_MAX_DEVICE_QUEUE_DEPTH 128 +#define FD_BLOCKSIZE 512 +/* + * Limited by the number of iovecs (2048) per vfs_[writev,readv] call + */ +#define FD_MAX_BYTES 8388608 + +#define RRF_EMULATE_CDB 0x01 +#define RRF_GOT_LBA 0x02 + +#define FBDF_HAS_PATH 0x01 +#define FBDF_HAS_SIZE 0x02 +#define FDBD_HAS_BUFFERED_IO_WCE 0x04 +#define FDBD_HAS_ASYNC_IO 0x08 +#define FDBD_FORMAT_UNIT_SIZE 2048 + +struct fd_dev { + struct se_device dev; + + u32 fbd_flags; + unsigned char fd_dev_name[FD_MAX_DEV_NAME]; + /* Unique Ramdisk Device ID in Ramdisk HBA */ + u32 fd_dev_id; + /* Number of SG tables in sg_table_array */ + u32 fd_table_count; + u32 fd_queue_depth; + u32 fd_block_size; + unsigned long long fd_dev_size; + struct file *fd_file; + struct file *fd_prot_file; + /* FILEIO HBA device is connected to */ + struct fd_host *fd_host; +} ____cacheline_aligned; + +struct fd_host { + u32 fd_host_dev_id_count; + /* Unique FILEIO Host ID */ + u32 fd_host_id; +} ____cacheline_aligned; + +#endif /* TARGET_CORE_FILE_H */ diff --git a/ops/os_stat/os_stat/include_6_6/fs/ext4_new/ext4.h b/ops/os_stat/os_stat/include_6_6/fs/ext4_new/ext4.h new file mode 100644 index 0000000000000000000000000000000000000000..b0b7fa4e2cc345fdef9b9559764dbe1299fb67c6 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/fs/ext4_new/ext4.h @@ -0,0 +1,3840 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif +#include + +#include +#include + +#include + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + + /* + * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c + */ +#define EXT_DEBUG__ + +/* + * Dynamic printk for controlled extents debugging. + */ +#ifdef CONFIG_EXT4_DEBUG +#define ext_debug(ino, fmt, ...) 
\ + pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ + current->comm, task_pid_nr(current), \ + ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ + __func__, ##__VA_ARGS__) +#else +#define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +#define ASSERT(assert) \ +do { \ + if (unlikely(!(assert))) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: '%s'\n", \ + __func__, __FILE__, __LINE__, #assert); \ + BUG(); \ + } \ +} while (0) + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +/* + * For each criteria, mballoc has slightly different way of finding + * the required blocks nad usually, higher the criteria the slower the + * allocation. We start at lower criterias and keep falling back to + * higher ones if we are not able to find any blocks. Lower (earlier) + * criteria are faster. + */ +enum criteria { + /* + * Used when number of blocks needed is a power of 2. This + * doesn't trigger any disk IO except prefetch and is the + * fastest criteria. + */ + CR_POWER2_ALIGNED, + + /* + * Tries to lookup in-memory data structures to find the most + * suitable group that satisfies goal request. No disk IO + * except block prefetch. + */ + CR_GOAL_LEN_FAST, + + /* + * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal + * length to the best available length for faster allocation. + */ + CR_BEST_AVAIL_LEN, + + /* + * Reads each block group sequentially, performing disk IO if + * necessary, to find find_suitable block group. Tries to + * allocate goal length but might trim the request if nothing + * is found after enough tries. + */ + CR_GOAL_LEN_SLOW, + + /* + * Finds the first free set of blocks and allocates + * those. This is only used in rare cases when + * CR_GOAL_LEN_SLOW also fails to allocate anything. + */ + CR_ANY_FREE, + + /* + * Number of criterias defined. + */ + EXT4_MB_NUM_CRS +}; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 +/* Do strict check for free blocks while retrying block allocation */ +#define EXT4_MB_STRICT_CHECK 0x4000 +/* Large fragment size list lookup succeeded at least once for cr = 0 */ +#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW BIT(BH_New) +#define EXT4_MAP_MAPPED BIT(BH_Mapped) +#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) +#define EXT4_MAP_BOUNDARY BIT(BH_Boundary) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 + +struct ext4_io_end_vec { + struct list_head list; /* list of io_end_vec */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +}; + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. 
+ */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + refcount_t count; /* reference counter */ + struct list_head list_vec; /* list of ext4_io_end_vec */ +} ext4_io_end_t; + +struct ext4_io_submit { + struct writeback_control *io_wbc; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) 
(EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ + +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +/* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ + EXT4_UNRM_FL | \ + EXT4_COMPR_FL | \ + EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_JOURNAL_DATA_FL | \ + EXT4_NOTAIL_FL | \ + EXT4_DIRSYNC_FL | \ + EXT4_TOPDIR_FL | \ + EXT4_EXTENTS_FL | \ + 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ + EXT4_DAX_FL | \ + EXT4_PROJINHERIT_FL | \ + EXT4_CASEFOLD_FL) + +/* User visible flags */ +#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ + EXT4_DIRTY_FL | \ + EXT4_COMPRBLK_FL | \ + EXT4_NOCOMPR_FL | \ + EXT4_ENCRYPT_FL | \ + EXT4_INDEX_FL | \ + EXT4_VERITY_FL | \ + EXT4_INLINE_DATA_FL) + +/* Flags that should be inherited by new inodes from their parent. 
*/ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ + EXT4_PROJINHERIT_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + +/* Flags which are mutually exclusive to DAX */ +#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ + EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ +/* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 25, /* Inode is DAX */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(CASEFOLD); + CHECK_FLAG_VALUE(RESERVED); +} + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 mdata_blocks; + __u32 free_clusters_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. 
*/ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Caller is in the atomic contex, find extent if it has been cached */ +#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 +#define EXT4_EX_NOFAIL 0x10000000 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 
2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. 
+ */ + +static inline __le32 ext4_encode_extra_time(struct timespec64 ts) +{ + u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline struct timespec64 ext4_decode_extra_time(__le32 base, + __le32 extra) +{ + struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; + + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) + ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + return ts; +} + +#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ + (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ + } else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ +} while (0) + +#define EXT4_INODE_SET_ATIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) + +#define EXT4_INODE_SET_MTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ + raw_inode, (einode)->xtime) + +#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ + (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ + ext4_decode_extra_time((raw_inode)->xtime, \ + (raw_inode)->xtime ## _extra) : \ + (struct timespec64) { \ + .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ + }) + +#define EXT4_INODE_GET_ATIME(inode, raw_inode) \ +do { \ + inode_set_atime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ +} while (0) + +#define EXT4_INODE_GET_MTIME(inode, raw_inode) \ +do { \ + inode_set_mtime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ +} while (0) + +#define EXT4_INODE_GET_CTIME(inode, raw_inode) \ +do { \ + inode_set_ctime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime = \ + EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ + raw_inode); \ + else \ + (einode)->xtime = (struct timespec64){0, 0}; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" +#include 
"fast_commit.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) + * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + * I_DATA_SEM_EA - Used for ea_inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, + I_DATA_SEM_EA +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_rwsem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + /* + * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise + * i_orphan is used. + */ + union { + struct list_head i_orphan; /* unlinked but open inodes */ + unsigned int i_orphan_idx; /* Index in orphan file */ + }; + + /* Fast commit related info */ + + /* For tracking dentry create updates */ + struct list_head i_fc_dilist; + struct list_head i_fc_list; /* + * inodes that need fast commit + * protected by sbi->s_fc_lock. + */ + + /* Start of lblk range that needs to be committed in this fast commit */ + ext4_lblk_t i_fc_lblk_start; + + /* End of lblk range that needs to be committed in this fast commit */ + ext4_lblk_t i_fc_lblk_len; + + /* Number of ongoing updates on this inode */ + atomic_t i_fc_updates; + + /* Fast commit wait queue for this inode */ + wait_queue_head_t i_fc_wait; + + /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ + struct mutex i_fc_lock; + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). 
+ */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. Its function is same as that of + * struct timespec64 i_{a,c,m}time in the generic inode. + */ + struct timespec64 i_crtime; + + /* mballoc */ + atomic_t i_prealloc_active; + struct rb_root i_prealloc_node; + rwlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot __rcu *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + + kprojid_t i_projid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ +#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX_ALWAYS 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + +#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ +#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group + * scanning in mballoc + */ +#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void mb_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +#define EXT4_LABEL_MAX 16 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. 
time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. 
+ */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __u8 s_wtime_hi; + __u8 s_mtime_hi; + __u8 s_mkfs_time_hi; + __u8 s_lastcheck_hi; + __u8 s_first_error_time_hi; + __u8 s_last_error_time_hi; 
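/*
 * Illustrative note (not upstream text): the *_hi bytes above extend the
 * corresponding 32-bit second counters (s_wtime, s_mtime, s_mkfs_time,
 * s_lastcheck and the first/last error times) past year 2038. The effective
 * 64-bit value is recovered roughly as:
 *
 *	(time64_t)le32_to_cpu(es->s_wtime) |
 *		((time64_t)es->s_wtime_hi << 32)
 */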
+ __u8 s_first_error_errcode; + __u8 s_last_error_errcode; + __le16 s_encoding; /* Filename charset encoding */ + __le16 s_encoding_flags; /* Filename charset encoding flags */ + __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ + __le32 s_reserved[94]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 3 + +#define EXT4_ENC_UTF8_12_1 1 + +/* Types of ext4 journal triggers */ +enum ext4_journal_trigger_type { + EXT4_JTR_ORPHAN_FILE, + EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ +}; + +#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE + +struct ext4_journal_trigger { + struct jbd2_buffer_trigger_type tr_triggers; + struct super_block *sb; +}; + +static inline struct ext4_journal_trigger *EXT4_TRIGGER( + struct jbd2_buffer_trigger_type *trigger) +{ + return container_of(trigger, struct ext4_journal_trigger, tr_triggers); +} + +#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 + +/* Structure at the tail of orphan block */ +struct ext4_orphan_block_tail { + __le32 ob_magic; + __le32 ob_checksum; +}; + +static inline int ext4_inodes_per_orphan_block(struct super_block *sb) +{ + return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / + sizeof(u32); +} + +struct ext4_orphan_block { + atomic_t ob_free_entries; /* Number of free orphan entries in block */ + struct buffer_head *ob_bh; /* Buffer for orphan block */ +}; + +/* + * Info about orphan file. + */ +struct ext4_orphan_info { + int of_blocks; /* Number of orphan blocks in a file */ + __u32 of_csum_seed; /* Checksum seed for orphan file */ + struct ext4_orphan_block *of_binfo; /* Array with info about orphan + * file blocks */ +}; + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head * __rcu *s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned long s_mount_flags; + unsigned int s_def_mount_opt; + unsigned int s_def_mount_opt2; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + 
u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; + struct buffer_head *s_mmp_bh; + + /* Journaling */ + struct journal_s *s_journal; + unsigned long s_ext4_flags; /* Ext4 superblock flags */ + struct mutex s_orphan_lock; /* Protects on disk list changes */ + struct list_head s_orphan; /* List of orphaned inodes in on disk + list */ + struct ext4_orphan_info s_orphan_info; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *s_journal_bdev; +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct ext4_system_blocks __rcu *s_system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ** __rcu *s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ + struct list_head s_discard_list; + struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; + struct list_head *s_mb_avg_fragment_size; + rwlock_t *s_mb_avg_fragment_size_locks; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + unsigned int s_mb_prefetch; + unsigned int s_mb_prefetch_limit; + unsigned int s_mb_best_avail_max_trim_order; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ + atomic_t s_bal_groups_scanned; /* number of groups scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_len_goals; /* len goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + atomic_t s_bal_p2_aligned_bad_suggestions; + atomic_t s_bal_goal_fast_bad_suggestions; + atomic_t s_bal_best_avail_bad_suggestions; + atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; 
/* cX loop didn't find blocks */ + atomic_t s_mb_buddies_generated; /* number of buddies generated */ + atomic64_t s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + unsigned long s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Journal triggers for checksum computation */ + struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + atomic_t s_warning_count; + atomic_t s_msg_count; + + /* Encryption policy for '-o test_dummy_encryption' */ + struct fscrypt_dummy_policy s_dummy_enc_policy; + + /* + * Barrier between writepages ops and changing any inode's JOURNAL_DATA + * or EXTENTS flag or between writepages ops and changing DELALLOC or + * DIOREAD_NOLOCK mount options on remount. + */ + struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; + u64 s_dax_part_off; +#ifdef CONFIG_EXT4_DEBUG + unsigned long s_simulate_fail; +#endif + /* Record the errseq of the backing block device */ + errseq_t s_bdev_wb_err; + spinlock_t s_bdev_wb_lock; + + /* Information about errors that happened during this mount */ + spinlock_t s_error_lock; + int s_add_error_count; + int s_first_error_code; + __u32 s_first_error_line; + __u32 s_first_error_ino; + __u64 s_first_error_block; + const char *s_first_error_func; + time64_t s_first_error_time; + int s_last_error_code; + __u32 s_last_error_line; + __u32 s_last_error_ino; + __u64 s_last_error_block; + const char *s_last_error_func; + time64_t s_last_error_time; + /* + * If we are in a context where we cannot update the on-disk + * superblock, we queue the work here. This is used to update + * the error information in the superblock, and for periodic + * updates of the superblock called from the commit callback + * function. 
+ */ + struct work_struct s_sb_upd_work; + + /* Ext4 fast commit sub transaction ID */ + atomic_t s_fc_subtid; + + /* + * After commit starts, the main queue gets locked, and the further + * updates get added in the staging queue. + */ +#define FC_Q_MAIN 0 +#define FC_Q_STAGING 1 + struct list_head s_fc_q[2]; /* Inodes staged for fast commit + * that have data changes in them. + */ + struct list_head s_fc_dentry_q[2]; /* directory entry updates */ + unsigned int s_fc_bytes; + /* + * Main fast commit lock. This lock protects accesses to the + * following fields: + * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. + */ + spinlock_t s_fc_lock; + struct buffer_head *s_fc_bh; + struct ext4_fc_stats s_fc_stats; + tid_t s_fc_ineligible_tid; +#ifdef CONFIG_EXT4_DEBUG + int s_fc_debug_max_replay; +#endif + struct ext4_fc_replay_state s_fc_replay_state; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline int ext4_writepages_down_read(struct super_block *sb) +{ + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_writepages_down_write(struct super_block *sb) +{ + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Returns: sbi->field[index] + * Used to access an array element from the following sbi fields which require + * rcu protection to avoid dereferencing an invalid pointer due to reassignment + * - s_group_desc + * - s_group_info + * - s_flex_group + */ +#define sbi_array_rcu_deref(sbi, field, index) \ +({ \ + typeof(*((sbi)->field)) _v; \ + rcu_read_lock(); \ + _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ + rcu_read_unlock(); \ + _v; \ +}) + +/* + * run-time mount flags + */ +enum { + EXT4_MF_MNTDIR_SAMPLED, + EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ +}; + +static inline void ext4_set_mount_flag(struct super_block *sb, int bit) +{ + set_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) +{ + clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline int ext4_test_mount_flag(struct super_block *sb, int bit) +{ + return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + + +/* + * Simulate_fail codes + */ +#define EXT4_SIM_BBITMAP_EIO 1 +#define EXT4_SIM_BBITMAP_CRC 2 +#define EXT4_SIM_IBITMAP_EIO 3 +#define EXT4_SIM_IBITMAP_CRC 4 +#define EXT4_SIM_INODE_EIO 5 +#define EXT4_SIM_INODE_CRC 6 +#define EXT4_SIM_DIRBLOCK_EIO 7 +#define EXT4_SIM_DIRBLOCK_CRC 8 + +static inline bool ext4_simulate_fail(struct super_block *sb, + unsigned long code) +{ +#ifdef CONFIG_EXT4_DEBUG + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (unlikely(sbi->s_simulate_fail == code)) { + sbi->s_simulate_fail = 0; + return true; + } +#endif + return false; +} + +/* + * Error 
number codes for s_{first,last}_error_errno + * + * Linux errno numbers are architecture specific, so we need to translate + * them into something which is architecture independent. We don't define + * codes for all errno's; just the ones which are most likely to be the cause + * of an ext4_error() call. + */ +#define EXT4_ERR_UNKNOWN 1 +#define EXT4_ERR_EIO 2 +#define EXT4_ERR_ENOMEM 3 +#define EXT4_ERR_EFSBADCRC 4 +#define EXT4_ERR_EFSCORRUPTED 5 +#define EXT4_ERR_ENOSPC 6 +#define EXT4_ERR_ENOKEY 7 +#define EXT4_ERR_EROFS 8 +#define EXT4_ERR_EFBIG 9 +#define EXT4_ERR_EEXIST 10 +#define EXT4_ERR_ERANGE 11 +#define EXT4_ERR_EOVERFLOW 12 +#define EXT4_ERR_EBUSY 13 +#define EXT4_ERR_ENOTDIR 14 +#define EXT4_ERR_ENOTEMPTY 15 +#define EXT4_ERR_ESHUTDOWN 16 +#define EXT4_ERR_EFAULT 17 + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
*/ +#define EXT4_SB(sb) (sb) +#endif + +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + +/* + * Feature set definitions + */ + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 +/* + * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes + * incompatible only if fast commit blocks are present in the FS. Since we + * clear the journal (and thus the fast commit blocks), we don't mark FS as + * incompatible. We also have a JBD2 incompat feature, which gets set when + * there are fast commit blocks present in the journal. + */ +#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 +#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 +#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 +#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be + non-empty */ + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 + +extern void ext4_update_dynamic_rev(struct super_block *sb); + +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) 
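A short usage sketch: each EXT4_FEATURE_*_FUNCS() invocation above expands into ext4_has_feature_xxx(), ext4_set_feature_xxx() and ext4_clear_feature_xxx() accessors for one superblock feature bit. The wrapper below is hypothetical (the name example_dir_can_use_htree is not part of the patch) and only shows how the generated helpers are called:

static inline bool example_dir_can_use_htree(struct super_block *sb)
{
	/* generated by EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) above */
	return ext4_has_feature_dir_index(sb);
}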
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) +EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) +EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) +EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) +EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ + EXT4_FEATURE_COMPAT_ORPHAN_FILE) +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY |\ + EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) + +#define EXTN_FEATURE_FUNCS(ver) \ +static inline 
bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + +extern int ext4_feature_set_ok(struct super_block *sb, int readonly); + +/* + * Superblock flags + */ +#define EXT4_FLAGS_RESIZING 0 +#define EXT4_FLAGS_SHUTDOWN 1 +#define EXT4_FLAGS_BDEV_IS_DAX 2 + +static inline int ext4_forced_shutdown(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); +} + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + + +/* + * Encrypted Casefolded entries require saving the hash on disk. This structure + * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned + * boundary. + */ +struct ext4_dir_entry_hash { + __le32 hash; + __le32 minor_hash; +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. 
+ */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; /* See file type macros EXT4_FT_* below */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Access the hashes at the end of ext4_dir_entry_2 + */ +#define EXT4_DIRENT_HASHES(entry) \ + ((struct ext4_dir_entry_hash *) \ + (((void *)(entry)) + \ + ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) +#define EXT4_DIRENT_MINOR_HASH(entry) \ + le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) + +static inline bool ext4_hash_in_dirent(const struct inode *inode) +{ + return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); +} + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * The rec_len is dependent on the type of directory. Directories that are + * casefolded and encrypted need to store the hash as well, so we add room for + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. + */ +static inline unsigned int ext4_dir_rec_len(__u8 name_len, + const struct inode *dir) +{ + int rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); + return (rec_len & ~EXT4_DIR_ROUND); +} + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... 
+ */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); +#if (PAGE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 +#define DX_HASH_SIPHASH 6 +#define DX_HASH_LAST DX_HASH_SIPHASH + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + *(u32 *)desc.ctx = crc; + + BUG_ON(crypto_shash_update(&desc.shash, address, length)); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + +struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_str crypto_buf; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + struct fscrypt_str cf_name; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_usr_name(p) ((p)->usr_fname->name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. 
+ */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +enum ext4_li_mode { + EXT4_LI_MODE_PREFETCH_BBITMAP, + EXT4_LI_MODE_ITABLE, +}; + +struct ext4_li_request { + struct super_block *lr_super; + enum ext4_li_mode lr_mode; + ext4_group_t lr_first_not_zeroed; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). 
The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group, + bool ignore_locked); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); + +#if IS_ENABLED(CONFIG_UNICODE) +extern int ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct ext4_filename *fname); +#endif + +/* ext4 encryption related stuff goes here crypto.c */ +#ifdef CONFIG_FS_ENCRYPTION +extern const struct fscrypt_operations ext4_cryptops; + +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); + +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname); + +void ext4_fname_free_filename(struct ext4_filename *fname); + +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); + +#else /* !CONFIG_FS_ENCRYPTION */ 
+static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + int err = 0; + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + +#if IS_ENABLED(CONFIG_UNICODE) + err = ext4_fname_setup_ci_filename(dir, iname, fname); +#endif + + return err; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ +#if IS_ENABLED(CONFIG_UNICODE) + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} + +static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_FS_ENCRYPTION */ + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *dir, struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... 
*/ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } +} +static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo); + +/* ialloc.c */ +extern int ext4_mark_inode_used(struct super_block *sb, int ino); +extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, + struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ + (goal), (owner), i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ + 0, (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* fast_commit.c */ +int ext4_fc_info_show(struct seq_file *seq, void *v); +void ext4_fc_init(struct super_block *sb, journal_t *journal); +void ext4_fc_init_inode(struct inode *inode); +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void __ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void __ext4_fc_track_create(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_inode(handle_t *handle, struct inode *inode); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); +void ext4_fc_start_update(struct inode *inode); +void ext4_fc_stop_update(struct inode *inode); +void ext4_fc_del(struct inode *inode); +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); +void ext4_fc_replay_cleanup(struct super_block *sb); +int ext4_fc_commit(journal_t *journal, tid_t commit_tid); +int __init ext4_fc_init_dentry_cache(void); +void ext4_fc_destroy_dentry_cache(void); +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, + int len, int replay); + +/* mballoc.c */ +extern const struct seq_operations ext4_mb_seq_groups_ops; +extern const struct 
seq_operations ext4_mb_seq_structs_summary_ops; +extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, + ext4_group_t group, + unsigned int nr, int *cnt); +extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr); + +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); +extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, + int len, int state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} + +/* inode.c */ +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei); +int ext4_inode_is_fast_symlink(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct inode *inode, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, struct inode *inode, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ + EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ + EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct mnt_idmap *, struct dentry *, + struct iattr *); +extern u32 ext4_dio_alignment(struct inode *inode); +extern int ext4_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +extern void 
ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, + struct ext4_iloc *iloc); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern int ext4_truncate(struct inode *); +extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern void ext4_set_inode_flags(struct inode *, bool init); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +int ext4_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct fileattr *fa); +int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern void ext4_reset_inode_seed(struct inode *inode); +int ext4_update_overhead(struct super_block *sb, bool force); +int ext4_force_shutdown(struct super_block *sb, u32 flags); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode); +extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern bool ext4_empty_dir(struct inode *inode); + +/* resize.c */ 
+extern void ext4_kvfree_array_rcu(void *to_free); +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); +extern unsigned int ext4_list_backups(struct super_block *sb, + unsigned int *three, unsigned int *five, + unsigned int *seven); + +/* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, blk_opf_t op_flags); +extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, + sector_t block); +extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail); +extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail); +extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); +extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); +extern int ext4_calculate_overhead(struct super_block *sb); +extern __le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); + +extern __printf(7, 8) +void __ext4_error(struct super_block *, const char *, unsigned int, bool, + int, __u64, const char *, ...); +extern __printf(6, 7) +void __ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, int, const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) + +#define ext4_error_inode_block(inode, block, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ + (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +#define ext4_abort(sb, err, fmt, a...) 
\ + __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ + __ext4_error_inode((inode), (func), (line), (block), \ + (err), (fmt), ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ + ##__VA_ARGS__) +#define ext4_error_err(sb, err, fmt, ...) \ + __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ + ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, 0, " "); \ +} while (0) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, err, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, false, 0, 0, " "); \ +} while (0) +#define ext4_error_err(sb, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, false, err, 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && + !EXT4_SB(sb)->s_chksum_driver); + + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + +#define ext4_read_incompat_64bit_val(es, name) \ + (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ + ? 
(ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ + le32_to_cpu(es->name##_lo)) + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_blocks_count); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_r_blocks_count); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_free_blocks_count); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) +{ + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_rwsem to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. 
Requires i_rwsem to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + +struct ext4_group_info { + unsigned long bb_state; +#ifdef AGGRESSIVE_CHECK + unsigned long bb_check_counter; +#endif + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + int bb_avg_fragment_size_order; /* order of average + fragment in BG */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + struct list_head bb_avg_fragment_size_node; + struct list_head bb_largest_free_order_node; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ + (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +#ifdef CONFIG_QUOTA +static inline bool ext4_quota_capable(struct super_block *sb) +{ + return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); +} + +static inline bool ext4_is_quota_journalled(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + return (ext4_has_feature_quota(sb) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); +} +int ext4_enable_quotas(struct super_block *sb); +#endif + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +int ext4_readpage_inline(struct inode *inode, struct folio *folio); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct page **pagep); +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct folio *folio); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct page **pagep, + void **fsdata); +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct inode *dir, struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int ext4_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern void *ext4_read_inline_link(struct inode *inode); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, 
EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void ext4_initialize_dirent_tail(struct buffer_head *bh, + unsigned int blocksize); +extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, struct dentry *dentry); +extern int __ext4_link(struct inode *dir, struct inode *inode, + struct dentry *dentry); + +#define S_SHIFT 12 +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (ext4_has_feature_filetype(sb)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct folio *folio); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); + +/* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* sysfs.c */ +extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); +extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, + ext4_fsblk_t start_blk, unsigned int count); + + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
+ */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, + ext4_io_end_t *io_end); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path **, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); +extern void ext4_free_ext_path(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); +extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred); +extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); +extern int ext4_ext_replay_set_iblocks(struct inode *inode); +extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, + int len, int unwritten, ext4_fsblk_t pblk); +extern int ext4_ext_clear_bb(struct inode *inode); + + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, + size_t len); +extern struct ext4_io_end_vec 
*ext4_alloc_io_end_vec(ext4_io_end_t *io_end); +extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* mmp.c */ +extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); + +/* verity.c */ +extern const struct fsverity_operations ext4_verityops; + +/* orphan.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_release_orphan_info(struct super_block *sb); +extern int ext4_init_orphan_info(struct super_block *sb); +extern int ext4_orphan_file_empty(struct super_block *sb); +extern void ext4_orphan_file_block_trigger( + struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size); + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; + +extern int ext4_resize_begin(struct super_block *sb); +extern int ext4_resize_end(struct super_block *sb, bool update_backups); + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + +extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_overwrite_ops; +extern const struct iomap_ops ext4_iomap_report_ops; + +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. 
+ */ + if (buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _EXT4_H */ diff --git a/ops/os_stat/os_stat/include_6_6/fs/ext4_new/extents_status.h b/ops/os_stat/os_stat/include_6_6/fs/ext4_new/extents_status.h new file mode 100644 index 0000000000000000000000000000000000000000..4d8f95b85cecfe61fe78f9a931afdc04d5e03bad --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/fs/ext4_new/extents_status.h @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. 
+ * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. + */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, + struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status 
*es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) +{ + ext4_fsblk_t pblock = ext4_es_pblock(es); + return pblock == ~ES_MASK ? 0 : pblock; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); + +extern unsigned int ext4_shrink_es_timeout; +extern unsigned int ext4_shrink_es_timeout_min; + +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_clear_inode_es(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/ops/os_stat/os_stat/include_6_6/fs/ext4_new/fast_commit.h b/ops/os_stat/os_stat/include_6_6/fs/ext4_new/fast_commit.h new file mode 100644 index 0000000000000000000000000000000000000000..2fadb2c4780c89d73cbd948de1eff9344e3350c0 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/fs/ext4_new/fast_commit.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __FAST_COMMIT_H__ +#define __FAST_COMMIT_H__ + +/* + * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and + * linux/fs/ext4/fast_commit.h. These file should always be byte identical. + */ + +/* Fast commit tags */ +#define EXT4_FC_TAG_ADD_RANGE 0x0001 +#define EXT4_FC_TAG_DEL_RANGE 0x0002 +#define EXT4_FC_TAG_CREAT 0x0003 +#define EXT4_FC_TAG_LINK 0x0004 +#define EXT4_FC_TAG_UNLINK 0x0005 +#define EXT4_FC_TAG_INODE 0x0006 +#define EXT4_FC_TAG_PAD 0x0007 +#define EXT4_FC_TAG_TAIL 0x0008 +#define EXT4_FC_TAG_HEAD 0x0009 + +#define EXT4_FC_SUPPORTED_FEATURES 0x0 + +/* On disk fast commit tlv value structures */ + +/* Fast commit on disk tag length structure */ +struct ext4_fc_tl { + __le16 fc_tag; + __le16 fc_len; +}; + +/* Value structure for tag EXT4_FC_TAG_HEAD. */ +struct ext4_fc_head { + __le32 fc_features; + __le32 fc_tid; +}; + +/* Value structure for EXT4_FC_TAG_ADD_RANGE. */ +struct ext4_fc_add_range { + __le32 fc_ino; + __u8 fc_ex[12]; +}; + +/* Value structure for tag EXT4_FC_TAG_DEL_RANGE. 
*/ +struct ext4_fc_del_range { + __le32 fc_ino; + __le32 fc_lblk; + __le32 fc_len; +}; + +/* + * This is the value structure for tags EXT4_FC_TAG_CREAT, EXT4_FC_TAG_LINK + * and EXT4_FC_TAG_UNLINK. + */ +struct ext4_fc_dentry_info { + __le32 fc_parent_ino; + __le32 fc_ino; + __u8 fc_dname[]; +}; + +/* Value structure for EXT4_FC_TAG_INODE. */ +struct ext4_fc_inode { + __le32 fc_ino; + __u8 fc_raw_inode[]; +}; + +/* Value structure for tag EXT4_FC_TAG_TAIL. */ +struct ext4_fc_tail { + __le32 fc_tid; + __le32 fc_crc; +}; + +/* Tag base length */ +#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl)) + +/* + * Fast commit status codes + */ +enum { + EXT4_FC_STATUS_OK = 0, + EXT4_FC_STATUS_INELIGIBLE, + EXT4_FC_STATUS_SKIPPED, + EXT4_FC_STATUS_FAILED, +}; + +/* + * Fast commit ineligiblity reasons: + */ +enum { + EXT4_FC_REASON_XATTR = 0, + EXT4_FC_REASON_CROSS_RENAME, + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, + EXT4_FC_REASON_NOMEM, + EXT4_FC_REASON_SWAP_BOOT, + EXT4_FC_REASON_RESIZE, + EXT4_FC_REASON_RENAME_DIR, + EXT4_FC_REASON_FALLOC_RANGE, + EXT4_FC_REASON_INODE_JOURNAL_DATA, + EXT4_FC_REASON_ENCRYPTED_FILENAME, + EXT4_FC_REASON_MAX +}; + +#ifdef __KERNEL__ +/* + * In memory list of dentry updates that are performed on the file + * system used by fast commit code. + */ +struct ext4_fc_dentry_update { + int fcd_op; /* Type of update create / unlink / link */ + int fcd_parent; /* Parent inode number */ + int fcd_ino; /* Inode number */ + struct qstr fcd_name; /* Dirent name */ + unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ + struct list_head fcd_list; + struct list_head fcd_dilist; +}; + +struct ext4_fc_stats { + unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX]; + unsigned long fc_num_commits; + unsigned long fc_ineligible_commits; + unsigned long fc_failed_commits; + unsigned long fc_skipped_commits; + unsigned long fc_numblks; + u64 s_fc_avg_commit_time; +}; + +#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4 + +/* + * Physical block regions added to different inodes due to fast commit + * recovery. These are set during the SCAN phase. During the replay phase, + * our allocator excludes these from its allocation. This ensures that + * we don't accidentally allocating a block that is going to be used by + * another inode. + */ +struct ext4_fc_alloc_region { + ext4_lblk_t lblk; + ext4_fsblk_t pblk; + int ino, len; +}; + +/* + * Fast commit replay state. 
+ */ +struct ext4_fc_replay_state { + int fc_replay_num_tags; + int fc_replay_expected_off; + int fc_current_pass; + int fc_cur_tag; + int fc_crc; + struct ext4_fc_alloc_region *fc_regions; + int fc_regions_size, fc_regions_used, fc_regions_valid; + int *fc_modified_inodes; + int fc_modified_inodes_used, fc_modified_inodes_size; +}; + +#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) +#endif + +static inline const char *tag2str(__u16 tag) +{ + switch (tag) { + case EXT4_FC_TAG_LINK: + return "ADD_ENTRY"; + case EXT4_FC_TAG_UNLINK: + return "DEL_ENTRY"; + case EXT4_FC_TAG_ADD_RANGE: + return "ADD_RANGE"; + case EXT4_FC_TAG_CREAT: + return "CREAT_DENTRY"; + case EXT4_FC_TAG_DEL_RANGE: + return "DEL_RANGE"; + case EXT4_FC_TAG_INODE: + return "INODE"; + case EXT4_FC_TAG_PAD: + return "PAD"; + case EXT4_FC_TAG_TAIL: + return "TAIL"; + case EXT4_FC_TAG_HEAD: + return "HEAD"; + default: + return "ERROR"; + } +} + +#endif /* __FAST_COMMIT_H__ */ diff --git a/ops/os_stat/os_stat/include_6_6/fs/ext4_old/ext4.h b/ops/os_stat/os_stat/include_6_6/fs/ext4_old/ext4.h new file mode 100644 index 0000000000000000000000000000000000000000..49c9fc9c3132fd3b6699e31794ad6b8973c7798a --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/fs/ext4_old/ext4.h @@ -0,0 +1,3837 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif +#include + +#include +#include + +#include + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + + /* + * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c + */ +#define EXT_DEBUG__ + +/* + * Dynamic printk for controlled extents debugging. + */ +#ifdef CONFIG_EXT4_DEBUG +#define ext_debug(ino, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ + current->comm, task_pid_nr(current), \ + ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ + __func__, ##__VA_ARGS__) +#else +#define ext_debug(ino, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) +#endif + +#define ASSERT(assert) \ +do { \ + if (unlikely(!(assert))) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: '%s'\n", \ + __func__, __FILE__, __LINE__, #assert); \ + BUG(); \ + } \ +} while (0) + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +/* + * For each criteria, mballoc has slightly different way of finding + * the required blocks nad usually, higher the criteria the slower the + * allocation. We start at lower criterias and keep falling back to + * higher ones if we are not able to find any blocks. Lower (earlier) + * criteria are faster. + */ +enum criteria { + /* + * Used when number of blocks needed is a power of 2. This + * doesn't trigger any disk IO except prefetch and is the + * fastest criteria. + */ + CR_POWER2_ALIGNED, + + /* + * Tries to lookup in-memory data structures to find the most + * suitable group that satisfies goal request. No disk IO + * except block prefetch. + */ + CR_GOAL_LEN_FAST, + + /* + * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal + * length to the best available length for faster allocation. + */ + CR_BEST_AVAIL_LEN, + + /* + * Reads each block group sequentially, performing disk IO if + * necessary, to find find_suitable block group. Tries to + * allocate goal length but might trim the request if nothing + * is found after enough tries. + */ + CR_GOAL_LEN_SLOW, + + /* + * Finds the first free set of blocks and allocates + * those. This is only used in rare cases when + * CR_GOAL_LEN_SLOW also fails to allocate anything. + */ + CR_ANY_FREE, + + /* + * Number of criterias defined. + */ + EXT4_MB_NUM_CRS +}; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 +/* Do strict check for free blocks while retrying block allocation */ +#define EXT4_MB_STRICT_CHECK 0x4000 +/* Large fragment size list lookup succeeded at least once for cr = 0 */ +#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW BIT(BH_New) +#define EXT4_MAP_MAPPED BIT(BH_Mapped) +#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) +#define EXT4_MAP_BOUNDARY BIT(BH_Boundary) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 + +struct ext4_io_end_vec { + struct list_head list; /* list of io_end_vec */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +}; + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. 
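struct ext4_map_blocks above is used in both directions: the caller fills in the logical range it wants (m_lblk, m_len) and gets back the physical start block plus EXT4_MAP_* result flags. The sketch below shows that in/out calling pattern against a stub resolver; the stub's mapping policy, flag values and return convention are invented for illustration and are not ext4's ext4_map_blocks().

#include <stdint.h>
#include <stdio.h>

typedef uint64_t ext4_fsblk_t;
typedef uint32_t ext4_lblk_t;

/* illustrative result flags (the real ones are derived from BH_* bits) */
#define DEMO_MAP_MAPPED 0x1
#define DEMO_MAP_NEW    0x2

struct demo_map_blocks {
    ext4_fsblk_t m_pblk;    /* out: first physical block */
    ext4_lblk_t  m_lblk;    /* in:  first logical block */
    unsigned int m_len;     /* in/out: blocks requested / blocks mapped */
    unsigned int m_flags;   /* out: DEMO_MAP_* result flags */
};

/* stub "mapper": pretends logical blocks below 100 are mapped 1:1 at 5000 */
static int demo_map_blocks(struct demo_map_blocks *map)
{
    if (map->m_lblk >= 100)
        return 0;                                   /* hole: nothing mapped */
    map->m_pblk  = 5000 + map->m_lblk;
    if (100 - map->m_lblk < map->m_len)
        map->m_len = 100 - map->m_lblk;             /* clamp to the mapped extent */
    map->m_flags = DEMO_MAP_MAPPED;
    return (int)map->m_len;                         /* >0: number of blocks mapped */
}

int main(void)
{
    struct demo_map_blocks map = { .m_lblk = 90, .m_len = 20 };
    int ret = demo_map_blocks(&map);

    if (ret > 0 && (map.m_flags & DEMO_MAP_MAPPED))
        printf("lblk %u..%u -> pblk %llu (%d blocks)\n",
               map.m_lblk, map.m_lblk + map.m_len - 1,
               (unsigned long long)map.m_pblk, ret);
    return 0;
}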
+ */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + refcount_t count; /* reference counter */ + struct list_head list_vec; /* list of ext4_io_end_vec */ +} ext4_io_end_t; + +struct ext4_io_submit { + struct writeback_control *io_wbc; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) 
(EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ + +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +/* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ + EXT4_UNRM_FL | \ + EXT4_COMPR_FL | \ + EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_JOURNAL_DATA_FL | \ + EXT4_NOTAIL_FL | \ + EXT4_DIRSYNC_FL | \ + EXT4_TOPDIR_FL | \ + EXT4_EXTENTS_FL | \ + 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ + EXT4_DAX_FL | \ + EXT4_PROJINHERIT_FL | \ + EXT4_CASEFOLD_FL) + +/* User visible flags */ +#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ + EXT4_DIRTY_FL | \ + EXT4_COMPRBLK_FL | \ + EXT4_NOCOMPR_FL | \ + EXT4_ENCRYPT_FL | \ + EXT4_INDEX_FL | \ + EXT4_VERITY_FL | \ + EXT4_INLINE_DATA_FL) + +/* Flags that should be inherited by new inodes from their parent. 
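The block/cluster translation macros defined just above the group-descriptor structure (EXT4_B2C, EXT4_C2B, EXT4_NUM_B2C and the CMASK/COFF helpers) are plain shift-and-mask arithmetic on s_cluster_bits, and the block size itself comes from EXT4_MIN_BLOCK_SIZE << s_log_block_size. A small worked example, with an assumed geometry of 4 KiB blocks and a 16-block bigalloc cluster:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* assumed geometry: s_log_block_size = 2 (4 KiB blocks), s_cluster_bits = 4;
     * real values come from the superblock, these are only for the arithmetic */
    unsigned int block_size    = 1024u << 2;        /* EXT4_MIN_BLOCK_SIZE << s_log_block_size */
    unsigned int cluster_bits  = 4;
    unsigned int cluster_ratio = 1u << cluster_bits;
    uint64_t blk = 1000;

    uint64_t cluster   = blk >> cluster_bits;                       /* EXT4_B2C       */
    uint64_t first_blk = cluster << cluster_bits;                   /* EXT4_C2B       */
    uint64_t offset    = blk & (cluster_ratio - 1);                 /* EXT4_PBLK_COFF */
    uint64_t nclusters = (20 + cluster_ratio - 1) >> cluster_bits;  /* EXT4_NUM_B2C(20) */

    printf("block size %u bytes\n", block_size);
    printf("block %llu -> cluster %llu (starts at block %llu, offset %llu)\n",
           (unsigned long long)blk, (unsigned long long)cluster,
           (unsigned long long)first_blk, (unsigned long long)offset);
    printf("a 20-block request rounds up to %llu clusters\n",
           (unsigned long long)nclusters);
    return 0;
}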
*/ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ + EXT4_PROJINHERIT_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + +/* Flags which are mutually exclusive to DAX */ +#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ + EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ +/* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 25, /* Inode is DAX */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(CASEFOLD); + CHECK_FLAG_VALUE(RESERVED); +} + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 mdata_blocks; + __u32 free_clusters_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. 
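The CHECK_FLAG_VALUE() trick above turns "EXT4_XXX_FL must equal 1 << EXT4_INODE_XXX" into a build failure instead of a runtime surprise. The same idea in plain C11, using _Static_assert in place of BUILD_BUG_ON, looks like the sketch below; the two demo constants are placeholders, not real ext4 flags.

#include <stdio.h>

/* a flag defined twice: once as a hex mask, once as a bit number */
#define DEMO_SECRM_FL     0x00000001
#define DEMO_INODE_SECRM  0

#define DEMO_APPEND_FL    0x00000020
#define DEMO_INODE_APPEND 5

/* compile-time equivalent of CHECK_FLAG_VALUE(): a mismatch breaks the build */
#define DEMO_CHECK_FLAG(FLAG) \
    _Static_assert(DEMO_##FLAG##_FL == (1U << DEMO_INODE_##FLAG), \
                   #FLAG " mask and bit number disagree")

DEMO_CHECK_FLAG(SECRM);
DEMO_CHECK_FLAG(APPEND);

int main(void)
{
    puts("flag masks and bit numbers are consistent");
    return 0;
}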
*/ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Caller is in the atomic contex, find extent if it has been cached */ +#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 +#define EXT4_EX_NOFAIL 0x10000000 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 
2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. 
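The epoch table above is easier to digest with the round trip written out: the on-disk inode keeps the low 32 bits of tv_sec in the base timestamp field, and the *_extra field carries bits 32/33 of tv_sec (the "extra epoch") plus the nanoseconds. The userspace sketch below mirrors ext4_encode_extra_time()/ext4_decode_extra_time(), which are defined right after this comment, for the non-negative-seconds case; the pre-1970 corner cases in the table are not handled here.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define EPOCH_BITS 2
#define EPOCH_MASK ((1u << EPOCH_BITS) - 1)

struct ts64 { int64_t tv_sec; uint32_t tv_nsec; };

/* mirrors ext4_encode_extra_time(): epoch bits low, nanoseconds above them */
static uint32_t encode_extra(struct ts64 ts)
{
    uint32_t extra = ((uint64_t)(ts.tv_sec - (int32_t)ts.tv_sec) >> 32) & EPOCH_MASK;

    return extra | (ts.tv_nsec << EPOCH_BITS);
}

/* mirrors ext4_decode_extra_time() */
static struct ts64 decode_extra(uint32_t base, uint32_t extra)
{
    struct ts64 ts = { .tv_sec = (int32_t)base };

    if (extra & EPOCH_MASK)
        ts.tv_sec += (int64_t)(extra & EPOCH_MASK) << 32;
    ts.tv_nsec = extra >> EPOCH_BITS;
    return ts;
}

int main(void)
{
    /* a post-2038 second that needs one extra epoch bit */
    struct ts64 in = { .tv_sec = 0x123456789LL, .tv_nsec = 42 };
    uint32_t base  = (uint32_t)in.tv_sec;
    struct ts64 out = decode_extra(base, encode_extra(in));

    assert(out.tv_sec == in.tv_sec && out.tv_nsec == in.tv_nsec);
    printf("round trip ok: sec=%lld nsec=%u\n", (long long)out.tv_sec, out.tv_nsec);
    return 0;
}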
+ */ + +static inline __le32 ext4_encode_extra_time(struct timespec64 ts) +{ + u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline struct timespec64 ext4_decode_extra_time(__le32 base, + __le32 extra) +{ + struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; + + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) + ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + return ts; +} + +#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ + (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ + } else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ +} while (0) + +#define EXT4_INODE_SET_ATIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) + +#define EXT4_INODE_SET_MTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ + raw_inode, (einode)->xtime) + +#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ + (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ + ext4_decode_extra_time((raw_inode)->xtime, \ + (raw_inode)->xtime ## _extra) : \ + (struct timespec64) { \ + .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ + }) + +#define EXT4_INODE_GET_ATIME(inode, raw_inode) \ +do { \ + inode_set_atime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ +} while (0) + +#define EXT4_INODE_GET_MTIME(inode, raw_inode) \ +do { \ + inode_set_mtime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ +} while (0) + +#define EXT4_INODE_GET_CTIME(inode, raw_inode) \ +do { \ + inode_set_ctime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime = \ + EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ + raw_inode); \ + else \ + (einode)->xtime = (struct timespec64){0, 0}; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" +#include 
"fast_commit.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) + * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + * I_DATA_SEM_EA - Used for ea_inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, + I_DATA_SEM_EA +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_rwsem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + /* + * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise + * i_orphan is used. + */ + union { + struct list_head i_orphan; /* unlinked but open inodes */ + unsigned int i_orphan_idx; /* Index in orphan file */ + }; + + /* Fast commit related info */ + + /* For tracking dentry create updates */ + struct list_head i_fc_dilist; + struct list_head i_fc_list; /* + * inodes that need fast commit + * protected by sbi->s_fc_lock. + */ + + /* Start of lblk range that needs to be committed in this fast commit */ + ext4_lblk_t i_fc_lblk_start; + + /* End of lblk range that needs to be committed in this fast commit */ + ext4_lblk_t i_fc_lblk_len; + + /* Number of ongoing updates on this inode */ + atomic_t i_fc_updates; + + /* Fast commit wait queue for this inode */ + wait_queue_head_t i_fc_wait; + + /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ + struct mutex i_fc_lock; + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). 
+ */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. Its function is same as that of + * struct timespec64 i_{a,c,m}time in the generic inode. + */ + struct timespec64 i_crtime; + + /* mballoc */ + atomic_t i_prealloc_active; + struct rb_root i_prealloc_node; + rwlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot __rcu *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + + kprojid_t i_projid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ +#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX_ALWAYS 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
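Each EXT4_MOUNT_* constant above is one bit in sbi->s_mount_opt, and the set_opt()/clear_opt()/test_opt() macros defined a little further down simply paste the option name onto the EXT4_MOUNT_ prefix and do the bit operation. A toy userspace version of the same pattern (names and values are placeholders, not the kernel macros):

#include <stdio.h>

/* placeholder option bits in the style of EXT4_MOUNT_DELALLOC / _DISCARD */
#define DEMO_MOUNT_DELALLOC 0x08000000u
#define DEMO_MOUNT_DISCARD  0x40000000u

struct demo_sb_info { unsigned int s_mount_opt; };

/* token pasting turns set_opt(sbi, DELALLOC) into a plain bit operation */
#define demo_set_opt(sbi, opt)   ((sbi)->s_mount_opt |= DEMO_MOUNT_##opt)
#define demo_clear_opt(sbi, opt) ((sbi)->s_mount_opt &= ~DEMO_MOUNT_##opt)
#define demo_test_opt(sbi, opt)  ((sbi)->s_mount_opt & DEMO_MOUNT_##opt)

int main(void)
{
    struct demo_sb_info sbi = { 0 };

    demo_set_opt(&sbi, DELALLOC);
    printf("delalloc=%d discard=%d\n",
           demo_test_opt(&sbi, DELALLOC) != 0,
           demo_test_opt(&sbi, DISCARD) != 0);
    demo_clear_opt(&sbi, DELALLOC);
    return 0;
}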
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + +#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ +#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group + * scanning in mballoc + */ +#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void mb_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +#define EXT4_LABEL_MAX 16 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. 
time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. 
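The comment above spells out the feature-set contract: unknown compatible bits may be ignored, unknown read-only-compatible bits should restrict the mount to read-only, and any unknown incompatible bit must make the kernel refuse the mount. A compact sketch of that decision follows; the "supported" masks are placeholders for illustration, not ext4's real feature lists.

#include <stdint.h>
#include <stdio.h>

/* placeholder masks for "features this code understands" */
#define DEMO_SUPP_INCOMPAT  0x000002C2u
#define DEMO_SUPP_RO_COMPAT 0x00000073u

/* return 0 = mount read-write, 1 = force read-only, -1 = refuse to mount */
static int demo_feature_check(uint32_t incompat, uint32_t ro_compat)
{
    if (incompat & ~DEMO_SUPP_INCOMPAT)
        return -1;          /* unknown incompat bit: refuse outright */
    if (ro_compat & ~DEMO_SUPP_RO_COMPAT)
        return 1;           /* unknown ro_compat bit: read-only only */
    return 0;               /* unknown compat bits would simply be ignored */
}

int main(void)
{
    printf("known features: %d\n", demo_feature_check(0x00000002u, 0x00000001u));
    printf("unknown ro_compat bit: %d\n", demo_feature_check(0x00000002u, 0x80000000u));
    printf("unknown incompat bit: %d\n", demo_feature_check(0x10000000u, 0x00000001u));
    return 0;
}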
+ */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __u8 s_wtime_hi; + __u8 s_mtime_hi; + __u8 s_mkfs_time_hi; + __u8 s_lastcheck_hi; + __u8 s_first_error_time_hi; + __u8 s_last_error_time_hi; 
+ __u8 s_first_error_errcode; + __u8 s_last_error_errcode; + __le16 s_encoding; /* Filename charset encoding */ + __le16 s_encoding_flags; /* Filename charset encoding flags */ + __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ + __le32 s_reserved[94]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 3 + +#define EXT4_ENC_UTF8_12_1 1 + +/* Types of ext4 journal triggers */ +enum ext4_journal_trigger_type { + EXT4_JTR_ORPHAN_FILE, + EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ +}; + +#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE + +struct ext4_journal_trigger { + struct jbd2_buffer_trigger_type tr_triggers; + struct super_block *sb; +}; + +static inline struct ext4_journal_trigger *EXT4_TRIGGER( + struct jbd2_buffer_trigger_type *trigger) +{ + return container_of(trigger, struct ext4_journal_trigger, tr_triggers); +} + +#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 + +/* Structure at the tail of orphan block */ +struct ext4_orphan_block_tail { + __le32 ob_magic; + __le32 ob_checksum; +}; + +static inline int ext4_inodes_per_orphan_block(struct super_block *sb) +{ + return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / + sizeof(u32); +} + +struct ext4_orphan_block { + atomic_t ob_free_entries; /* Number of free orphan entries in block */ + struct buffer_head *ob_bh; /* Buffer for orphan block */ +}; + +/* + * Info about orphan file. + */ +struct ext4_orphan_info { + int of_blocks; /* Number of orphan blocks in a file */ + __u32 of_csum_seed; /* Checksum seed for orphan file */ + struct ext4_orphan_block *of_binfo; /* Array with info about orphan + * file blocks */ +}; + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head * __rcu *s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned long s_mount_flags; + unsigned int s_def_mount_opt; + unsigned int s_def_mount_opt2; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + 
u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; + struct buffer_head *s_mmp_bh; + + /* Journaling */ + struct journal_s *s_journal; + unsigned long s_ext4_flags; /* Ext4 superblock flags */ + struct mutex s_orphan_lock; /* Protects on disk list changes */ + struct list_head s_orphan; /* List of orphaned inodes in on disk + list */ + struct ext4_orphan_info s_orphan_info; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *s_journal_bdev; +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct ext4_system_blocks __rcu *s_system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ** __rcu *s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ + struct list_head s_discard_list; + struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; + struct list_head *s_mb_avg_fragment_size; + rwlock_t *s_mb_avg_fragment_size_locks; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + unsigned int s_mb_prefetch; + unsigned int s_mb_prefetch_limit; + unsigned int s_mb_best_avail_max_trim_order; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ + atomic_t s_bal_groups_scanned; /* number of groups scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_len_goals; /* len goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + atomic_t s_bal_p2_aligned_bad_suggestions; + atomic_t s_bal_goal_fast_bad_suggestions; + atomic_t s_bal_best_avail_bad_suggestions; + atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; 
/* cX loop didn't find blocks */ + atomic_t s_mb_buddies_generated; /* number of buddies generated */ + atomic64_t s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + unsigned long s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Journal triggers for checksum computation */ + struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + atomic_t s_warning_count; + atomic_t s_msg_count; + + /* Encryption policy for '-o test_dummy_encryption' */ + struct fscrypt_dummy_policy s_dummy_enc_policy; + + /* + * Barrier between writepages ops and changing any inode's JOURNAL_DATA + * or EXTENTS flag or between writepages ops and changing DELALLOC or + * DIOREAD_NOLOCK mount options on remount. + */ + struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; + u64 s_dax_part_off; +#ifdef CONFIG_EXT4_DEBUG + unsigned long s_simulate_fail; +#endif + /* Record the errseq of the backing block device */ + errseq_t s_bdev_wb_err; + spinlock_t s_bdev_wb_lock; + + /* Information about errors that happened during this mount */ + spinlock_t s_error_lock; + int s_add_error_count; + int s_first_error_code; + __u32 s_first_error_line; + __u32 s_first_error_ino; + __u64 s_first_error_block; + const char *s_first_error_func; + time64_t s_first_error_time; + int s_last_error_code; + __u32 s_last_error_line; + __u32 s_last_error_ino; + __u64 s_last_error_block; + const char *s_last_error_func; + time64_t s_last_error_time; + /* + * If we are in a context where we cannot update the on-disk + * superblock, we queue the work here. This is used to update + * the error information in the superblock, and for periodic + * updates of the superblock called from the commit callback + * function. 
+ */ + struct work_struct s_sb_upd_work; + + /* Ext4 fast commit sub transaction ID */ + atomic_t s_fc_subtid; + + /* + * After commit starts, the main queue gets locked, and the further + * updates get added in the staging queue. + */ +#define FC_Q_MAIN 0 +#define FC_Q_STAGING 1 + struct list_head s_fc_q[2]; /* Inodes staged for fast commit + * that have data changes in them. + */ + struct list_head s_fc_dentry_q[2]; /* directory entry updates */ + unsigned int s_fc_bytes; + /* + * Main fast commit lock. This lock protects accesses to the + * following fields: + * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. + */ + spinlock_t s_fc_lock; + struct buffer_head *s_fc_bh; + struct ext4_fc_stats s_fc_stats; + tid_t s_fc_ineligible_tid; +#ifdef CONFIG_EXT4_DEBUG + int s_fc_debug_max_replay; +#endif + struct ext4_fc_replay_state s_fc_replay_state; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline int ext4_writepages_down_read(struct super_block *sb) +{ + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_writepages_down_write(struct super_block *sb) +{ + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Returns: sbi->field[index] + * Used to access an array element from the following sbi fields which require + * rcu protection to avoid dereferencing an invalid pointer due to reassignment + * - s_group_desc + * - s_group_info + * - s_flex_group + */ +#define sbi_array_rcu_deref(sbi, field, index) \ +({ \ + typeof(*((sbi)->field)) _v; \ + rcu_read_lock(); \ + _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ + rcu_read_unlock(); \ + _v; \ +}) + +/* + * run-time mount flags + */ +enum { + EXT4_MF_MNTDIR_SAMPLED, + EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ +}; + +static inline void ext4_set_mount_flag(struct super_block *sb, int bit) +{ + set_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) +{ + clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + +static inline int ext4_test_mount_flag(struct super_block *sb, int bit) +{ + return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); +} + + +/* + * Simulate_fail codes + */ +#define EXT4_SIM_BBITMAP_EIO 1 +#define EXT4_SIM_BBITMAP_CRC 2 +#define EXT4_SIM_IBITMAP_EIO 3 +#define EXT4_SIM_IBITMAP_CRC 4 +#define EXT4_SIM_INODE_EIO 5 +#define EXT4_SIM_INODE_CRC 6 +#define EXT4_SIM_DIRBLOCK_EIO 7 +#define EXT4_SIM_DIRBLOCK_CRC 8 + +static inline bool ext4_simulate_fail(struct super_block *sb, + unsigned long code) +{ +#ifdef CONFIG_EXT4_DEBUG + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (unlikely(sbi->s_simulate_fail == code)) { + sbi->s_simulate_fail = 0; + return true; + } +#endif + return false; +} + +/* + * Error 
number codes for s_{first,last}_error_errno + * + * Linux errno numbers are architecture specific, so we need to translate + * them into something which is architecture independent. We don't define + * codes for all errno's; just the ones which are most likely to be the cause + * of an ext4_error() call. + */ +#define EXT4_ERR_UNKNOWN 1 +#define EXT4_ERR_EIO 2 +#define EXT4_ERR_ENOMEM 3 +#define EXT4_ERR_EFSBADCRC 4 +#define EXT4_ERR_EFSCORRUPTED 5 +#define EXT4_ERR_ENOSPC 6 +#define EXT4_ERR_ENOKEY 7 +#define EXT4_ERR_EROFS 8 +#define EXT4_ERR_EFBIG 9 +#define EXT4_ERR_EEXIST 10 +#define EXT4_ERR_ERANGE 11 +#define EXT4_ERR_EOVERFLOW 12 +#define EXT4_ERR_EBUSY 13 +#define EXT4_ERR_ENOTDIR 14 +#define EXT4_ERR_ENOTEMPTY 15 +#define EXT4_ERR_ESHUTDOWN 16 +#define EXT4_ERR_EFAULT 17 + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
*/ +#define EXT4_SB(sb) (sb) +#endif + +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + +/* + * Feature set definitions + */ + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 +/* + * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes + * incompatible only if fast commit blocks are present in the FS. Since we + * clear the journal (and thus the fast commit blocks), we don't mark FS as + * incompatible. We also have a JBD2 incompat feature, which gets set when + * there are fast commit blocks present in the journal. + */ +#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 +#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 +#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 +#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be + non-empty */ + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 + +extern void ext4_update_dynamic_rev(struct super_block *sb); + +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) 
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) +EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) +EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) +EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) +EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ + EXT4_FEATURE_COMPAT_ORPHAN_FILE) +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY |\ + EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) + +#define EXTN_FEATURE_FUNCS(ver) \ +static inline 
bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + +extern int ext4_feature_set_ok(struct super_block *sb, int readonly); + +/* + * Superblock flags + */ +#define EXT4_FLAGS_RESIZING 0 +#define EXT4_FLAGS_SHUTDOWN 1 +#define EXT4_FLAGS_BDEV_IS_DAX 2 + +static inline int ext4_forced_shutdown(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); +} + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + + +/* + * Encrypted Casefolded entries require saving the hash on disk. This structure + * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned + * boundary. + */ +struct ext4_dir_entry_hash { + __le32 hash; + __le32 minor_hash; +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. 
+ */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; /* See file type macros EXT4_FT_* below */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Access the hashes at the end of ext4_dir_entry_2 + */ +#define EXT4_DIRENT_HASHES(entry) \ + ((struct ext4_dir_entry_hash *) \ + (((void *)(entry)) + \ + ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) +#define EXT4_DIRENT_MINOR_HASH(entry) \ + le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) + +static inline bool ext4_hash_in_dirent(const struct inode *inode) +{ + return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); +} + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * The rec_len is dependent on the type of directory. Directories that are + * casefolded and encrypted need to store the hash as well, so we add room for + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. + */ +static inline unsigned int ext4_dir_rec_len(__u8 name_len, + const struct inode *dir) +{ + int rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); + return (rec_len & ~EXT4_DIR_ROUND); +} + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... 
+ */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); +#if (PAGE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 +#define DX_HASH_SIPHASH 6 +#define DX_HASH_LAST DX_HASH_SIPHASH + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + *(u32 *)desc.ctx = crc; + + BUG_ON(crypto_shash_update(&desc.shash, address, length)); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + +struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_str crypto_buf; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + struct fscrypt_str cf_name; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_usr_name(p) ((p)->usr_fname->name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. 
+ */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +enum ext4_li_mode { + EXT4_LI_MODE_PREFETCH_BBITMAP, + EXT4_LI_MODE_ITABLE, +}; + +struct ext4_li_request { + struct super_block *lr_super; + enum ext4_li_mode lr_mode; + ext4_group_t lr_first_not_zeroed; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). 
The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group, + bool ignore_locked); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); + +#if IS_ENABLED(CONFIG_UNICODE) +extern int ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct ext4_filename *fname); +#endif + +/* ext4 encryption related stuff goes here crypto.c */ +#ifdef CONFIG_FS_ENCRYPTION +extern const struct fscrypt_operations ext4_cryptops; + +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); + +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname); + +void ext4_fname_free_filename(struct ext4_filename *fname); + +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); + +#else /* !CONFIG_FS_ENCRYPTION */ 
+static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + int err = 0; + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + +#if IS_ENABLED(CONFIG_UNICODE) + err = ext4_fname_setup_ci_filename(dir, iname, fname); +#endif + + return err; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ +#if IS_ENABLED(CONFIG_UNICODE) + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} + +static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_FS_ENCRYPTION */ + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *dir, struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... 
*/ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } +} +static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo); + +/* ialloc.c */ +extern int ext4_mark_inode_used(struct super_block *sb, int ino); +extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, + struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ + (goal), (owner), i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ + 0, (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* fast_commit.c */ +int ext4_fc_info_show(struct seq_file *seq, void *v); +void ext4_fc_init(struct super_block *sb, journal_t *journal); +void ext4_fc_init_inode(struct inode *inode); +void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void __ext4_fc_track_link(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); +void __ext4_fc_track_create(handle_t *handle, struct inode *inode, + struct dentry *dentry); +void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); +void ext4_fc_track_inode(handle_t *handle, struct inode *inode); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); +void ext4_fc_start_update(struct inode *inode); +void ext4_fc_stop_update(struct inode *inode); +void ext4_fc_del(struct inode *inode); +bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); +void ext4_fc_replay_cleanup(struct super_block *sb); +int ext4_fc_commit(journal_t *journal, tid_t commit_tid); +int __init ext4_fc_init_dentry_cache(void); +void ext4_fc_destroy_dentry_cache(void); +int ext4_fc_record_regions(struct super_block *sb, int ino, + ext4_lblk_t lblk, ext4_fsblk_t pblk, + int len, int replay); + +/* mballoc.c */ +extern const struct seq_operations ext4_mb_seq_groups_ops; +extern const struct 
seq_operations ext4_mb_seq_structs_summary_ops; +extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, + ext4_group_t group, + unsigned int nr, int *cnt); +extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, + unsigned int nr); + +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); +extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, + int len, int state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} + +/* inode.c */ +void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei); +int ext4_inode_is_fast_symlink(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct inode *inode, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, struct inode *inode, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ + EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ + EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct mnt_idmap *, struct dentry *, + struct iattr *); +extern u32 ext4_dio_alignment(struct inode *inode); +extern int ext4_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +extern void 
ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, + struct ext4_iloc *iloc); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern int ext4_truncate(struct inode *); +extern int ext4_break_layouts(struct inode *); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern void ext4_set_inode_flags(struct inode *, bool init); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +int ext4_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct fileattr *fa); +int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern void ext4_reset_inode_seed(struct inode *inode); +int ext4_update_overhead(struct super_block *sb, bool force); +int ext4_force_shutdown(struct super_block *sb, u32 flags); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode); +extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern bool ext4_empty_dir(struct inode *inode); + +/* resize.c */ +extern void ext4_kvfree_array_rcu(void *to_free); +extern int ext4_group_add(struct super_block 
*sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); +extern unsigned int ext4_list_backups(struct super_block *sb, + unsigned int *three, unsigned int *five, + unsigned int *seven); + +/* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, blk_opf_t op_flags); +extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, + sector_t block); +extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail); +extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail); +extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); +extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); +extern int ext4_calculate_overhead(struct super_block *sb); +extern __le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); + +extern __printf(7, 8) +void __ext4_error(struct super_block *, const char *, unsigned int, bool, + int, __u64, const char *, ...); +extern __printf(6, 7) +void __ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, int, const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) + +#define ext4_error_inode_block(inode, block, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ + (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +#define ext4_abort(sb, err, fmt, a...) \ + __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) 
\ + __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ + __ext4_error_inode((inode), (func), (line), (block), \ + (err), (fmt), ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ + ##__VA_ARGS__) +#define ext4_error_err(sb, err, fmt, ...) \ + __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ + ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, 0, " "); \ +} while (0) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, err, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, false, 0, 0, " "); \ +} while (0) +#define ext4_error_err(sb, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, false, err, 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && + !EXT4_SB(sb)->s_chksum_driver); + + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + +#define ext4_read_incompat_64bit_val(es, name) \ + (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ + ? 
(ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ + le32_to_cpu(es->name##_lo)) + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_blocks_count); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_r_blocks_count); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ext4_read_incompat_64bit_val(es, s_free_blocks_count); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) +{ + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_rwsem to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. 
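Side note on the lo/hi pattern used by ext4_blocks_count()/ext4_blocks_count_set() above: the 64-bit counter is split across two 32-bit superblock fields, and the high half is only trusted when the 64BIT incompat feature is present. A minimal userspace sketch of the same idea (illustrative names, plain uint32_t fields, the on-disk little-endian conversion omitted):

#include <stdint.h>
#include <stdio.h>

struct sb_counts {
	uint32_t blocks_count_lo;   /* low 32 bits of the block count */
	uint32_t blocks_count_hi;   /* high 32 bits, meaningful only with the 64bit feature */
};

static void blocks_count_set(struct sb_counts *sb, uint64_t blk)
{
	sb->blocks_count_lo = (uint32_t)blk;
	sb->blocks_count_hi = (uint32_t)(blk >> 32);
}

static uint64_t blocks_count_get(const struct sb_counts *sb, int has_64bit)
{
	/* ignore the high word unless the feature flag says it is valid */
	uint64_t hi = has_64bit ? (uint64_t)sb->blocks_count_hi << 32 : 0;

	return hi | sb->blocks_count_lo;
}

int main(void)
{
	struct sb_counts sb;

	blocks_count_set(&sb, 0x123456789abcULL);
	printf("%#llx\n", (unsigned long long)blocks_count_get(&sb, 1));
	return 0;
}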
Requires i_rwsem to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + +struct ext4_group_info { + unsigned long bb_state; +#ifdef AGGRESSIVE_CHECK + unsigned long bb_check_counter; +#endif + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + int bb_avg_fragment_size_order; /* order of average + fragment in BG */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + struct list_head bb_avg_fragment_size_node; + struct list_head bb_largest_free_order_node; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ + (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +#ifdef CONFIG_QUOTA +static inline bool ext4_quota_capable(struct super_block *sb) +{ + return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); +} + +static inline bool ext4_is_quota_journalled(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + return (ext4_has_feature_quota(sb) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); +} +int ext4_enable_quotas(struct super_block *sb); +#endif + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +int ext4_readpage_inline(struct inode *inode, struct folio *folio); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct page **pagep); +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct folio *folio); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct page **pagep, + void **fsdata); +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct inode *dir, struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int ext4_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern void *ext4_read_inline_link(struct inode *inode); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, 
EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void ext4_initialize_dirent_tail(struct buffer_head *bh, + unsigned int blocksize); +extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, struct dentry *dentry); +extern int __ext4_link(struct inode *dir, struct inode *inode, + struct dentry *dentry); + +#define S_SHIFT 12 +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (ext4_has_feature_filetype(sb)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct folio *folio); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); + +/* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* sysfs.c */ +extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); +extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, + ext4_fsblk_t start_blk, unsigned int count); + + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
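The ext4_type_by_mode[] table above turns the S_IFMT bits of an inode mode into a one-byte directory-entry type with a single shift and lookup. A small userspace sketch of the same lookup, using the standard sys/stat.h mode macros and illustrative names (not the ext4 constants):

#include <stdio.h>
#include <sys/stat.h>

#define TYPE_SHIFT 12	/* same shift as S_SHIFT above */

static const char *const name_by_mode[(S_IFMT >> TYPE_SHIFT) + 1] = {
	[S_IFREG  >> TYPE_SHIFT] = "regular file",
	[S_IFDIR  >> TYPE_SHIFT] = "directory",
	[S_IFCHR  >> TYPE_SHIFT] = "character device",
	[S_IFBLK  >> TYPE_SHIFT] = "block device",
	[S_IFIFO  >> TYPE_SHIFT] = "fifo",
	[S_IFSOCK >> TYPE_SHIFT] = "socket",
	[S_IFLNK  >> TYPE_SHIFT] = "symlink",
};

int main(void)
{
	mode_t mode = S_IFDIR | 0755;

	/* same single table lookup ext4_set_de_type() performs on de->file_type */
	printf("%s\n", name_by_mode[(mode & S_IFMT) >> TYPE_SHIFT]);
	return 0;
}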
+ */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, + ext4_io_end_t *io_end); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path **, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); +extern void ext4_free_ext_path(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); +extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred); +extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); +extern int ext4_ext_replay_set_iblocks(struct inode *inode); +extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, + int len, int unwritten, ext4_fsblk_t pblk); +extern int ext4_ext_clear_bb(struct inode *inode); + + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, + size_t len); +extern struct ext4_io_end_vec 
*ext4_alloc_io_end_vec(ext4_io_end_t *io_end); +extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* mmp.c */ +extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); + +/* verity.c */ +extern const struct fsverity_operations ext4_verityops; + +/* orphan.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_release_orphan_info(struct super_block *sb); +extern int ext4_init_orphan_info(struct super_block *sb); +extern int ext4_orphan_file_empty(struct super_block *sb); +extern void ext4_orphan_file_block_trigger( + struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size); + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; + +extern int ext4_resize_begin(struct super_block *sb); +extern int ext4_resize_end(struct super_block *sb, bool update_backups); + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + +extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_overwrite_ops; +extern const struct iomap_ops ext4_iomap_report_ops; + +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. 
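The ext4_set_io_unwritten_flag()/ext4_clear_io_unwritten_flag() pair above counts each io_end at most once in i_unwritten and wakes waiters on the hashed wait queue when the last conversion drops the count to zero. A rough userspace sketch of that "last one out wakes the waiters" discipline, using C11 atomics and a condition variable in place of the kernel primitives (names are illustrative only):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int unwritten;	/* stands in for EXT4_I(inode)->i_unwritten */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_converted = PTHREAD_COND_INITIALIZER;

struct io_end { int unwritten_flag; };

static void set_io_unwritten(struct io_end *io)
{
	if (!io->unwritten_flag) {		/* count each io_end at most once */
		io->unwritten_flag = 1;
		atomic_fetch_add(&unwritten, 1);
	}
}

static void clear_io_unwritten(struct io_end *io)
{
	if (io->unwritten_flag) {
		io->unwritten_flag = 0;
		/* the last conversion wakes anyone waiting for the inode to settle */
		if (atomic_fetch_sub(&unwritten, 1) == 1) {
			pthread_mutex_lock(&lock);
			pthread_cond_broadcast(&all_converted);
			pthread_mutex_unlock(&lock);
		}
	}
}

int main(void)
{
	struct io_end a = {0}, b = {0};

	set_io_unwritten(&a);
	set_io_unwritten(&b);
	clear_io_unwritten(&a);
	clear_io_unwritten(&b);
	printf("outstanding: %d\n", atomic_load(&unwritten));
	return 0;
}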
+ */ + if (buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _EXT4_H */ diff --git a/ops/os_stat/os_stat/include_6_6/fs/ext4_old/extents_status.h b/ops/os_stat/os_stat/include_6_6/fs/ext4_old/extents_status.h new file mode 100644 index 0000000000000000000000000000000000000000..4d8f95b85cecfe61fe78f9a931afdc04d5e03bad --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/fs/ext4_old/extents_status.h @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. 
+ * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. + */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, + struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status 
*es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) +{ + ext4_fsblk_t pblock = ext4_es_pblock(es); + return pblock == ~ES_MASK ? 0 : pblock; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); + +extern unsigned int ext4_shrink_es_timeout; +extern unsigned int ext4_shrink_es_timeout_min; + +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_clear_inode_es(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/ops/os_stat/os_stat/include_6_6/fs/ext4_old/fast_commit.h b/ops/os_stat/os_stat/include_6_6/fs/ext4_old/fast_commit.h new file mode 100644 index 0000000000000000000000000000000000000000..2fadb2c4780c89d73cbd948de1eff9344e3350c0 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/fs/ext4_old/fast_commit.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __FAST_COMMIT_H__ +#define __FAST_COMMIT_H__ + +/* + * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and + * linux/fs/ext4/fast_commit.h. These file should always be byte identical. + */ + +/* Fast commit tags */ +#define EXT4_FC_TAG_ADD_RANGE 0x0001 +#define EXT4_FC_TAG_DEL_RANGE 0x0002 +#define EXT4_FC_TAG_CREAT 0x0003 +#define EXT4_FC_TAG_LINK 0x0004 +#define EXT4_FC_TAG_UNLINK 0x0005 +#define EXT4_FC_TAG_INODE 0x0006 +#define EXT4_FC_TAG_PAD 0x0007 +#define EXT4_FC_TAG_TAIL 0x0008 +#define EXT4_FC_TAG_HEAD 0x0009 + +#define EXT4_FC_SUPPORTED_FEATURES 0x0 + +/* On disk fast commit tlv value structures */ + +/* Fast commit on disk tag length structure */ +struct ext4_fc_tl { + __le16 fc_tag; + __le16 fc_len; +}; + +/* Value structure for tag EXT4_FC_TAG_HEAD. */ +struct ext4_fc_head { + __le32 fc_features; + __le32 fc_tid; +}; + +/* Value structure for EXT4_FC_TAG_ADD_RANGE. */ +struct ext4_fc_add_range { + __le32 fc_ino; + __u8 fc_ex[12]; +}; + +/* Value structure for tag EXT4_FC_TAG_DEL_RANGE. 
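The ext4_es_store_pblock_status()/ext4_es_pblock()/ext4_es_status() helpers above pack a handful of status bits into the top ES_FLAGS bits of es_pblk and keep the physical block number in the remaining low bits. A compact userspace sketch of the same packing scheme (illustrative flag names, 64-bit word assumed):

#include <stdint.h>
#include <stdio.h>

#define FLAG_BITS   5			/* like ES_FLAGS */
#define FLAG_SHIFT  (64 - FLAG_BITS)	/* like ES_SHIFT */
#define FLAG_MASK   (~0ULL << FLAG_SHIFT)	/* like ES_MASK */

#define ST_WRITTEN  (1u << 0)
#define ST_DELAYED  (1u << 2)

static uint64_t pack(uint64_t pblk, unsigned int flags)
{
	return (((uint64_t)flags << FLAG_SHIFT) & FLAG_MASK) | (pblk & ~FLAG_MASK);
}

static uint64_t pblock(uint64_t packed)       { return packed & ~FLAG_MASK; }
static unsigned int es_flags(uint64_t packed) { return packed >> FLAG_SHIFT; }

int main(void)
{
	uint64_t es = pack(123456, ST_WRITTEN);

	printf("pblk=%llu written=%d delayed=%d\n",
	       (unsigned long long)pblock(es),
	       !!(es_flags(es) & ST_WRITTEN), !!(es_flags(es) & ST_DELAYED));
	return 0;
}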
*/ +struct ext4_fc_del_range { + __le32 fc_ino; + __le32 fc_lblk; + __le32 fc_len; +}; + +/* + * This is the value structure for tags EXT4_FC_TAG_CREAT, EXT4_FC_TAG_LINK + * and EXT4_FC_TAG_UNLINK. + */ +struct ext4_fc_dentry_info { + __le32 fc_parent_ino; + __le32 fc_ino; + __u8 fc_dname[]; +}; + +/* Value structure for EXT4_FC_TAG_INODE. */ +struct ext4_fc_inode { + __le32 fc_ino; + __u8 fc_raw_inode[]; +}; + +/* Value structure for tag EXT4_FC_TAG_TAIL. */ +struct ext4_fc_tail { + __le32 fc_tid; + __le32 fc_crc; +}; + +/* Tag base length */ +#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl)) + +/* + * Fast commit status codes + */ +enum { + EXT4_FC_STATUS_OK = 0, + EXT4_FC_STATUS_INELIGIBLE, + EXT4_FC_STATUS_SKIPPED, + EXT4_FC_STATUS_FAILED, +}; + +/* + * Fast commit ineligiblity reasons: + */ +enum { + EXT4_FC_REASON_XATTR = 0, + EXT4_FC_REASON_CROSS_RENAME, + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, + EXT4_FC_REASON_NOMEM, + EXT4_FC_REASON_SWAP_BOOT, + EXT4_FC_REASON_RESIZE, + EXT4_FC_REASON_RENAME_DIR, + EXT4_FC_REASON_FALLOC_RANGE, + EXT4_FC_REASON_INODE_JOURNAL_DATA, + EXT4_FC_REASON_ENCRYPTED_FILENAME, + EXT4_FC_REASON_MAX +}; + +#ifdef __KERNEL__ +/* + * In memory list of dentry updates that are performed on the file + * system used by fast commit code. + */ +struct ext4_fc_dentry_update { + int fcd_op; /* Type of update create / unlink / link */ + int fcd_parent; /* Parent inode number */ + int fcd_ino; /* Inode number */ + struct qstr fcd_name; /* Dirent name */ + unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ + struct list_head fcd_list; + struct list_head fcd_dilist; +}; + +struct ext4_fc_stats { + unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX]; + unsigned long fc_num_commits; + unsigned long fc_ineligible_commits; + unsigned long fc_failed_commits; + unsigned long fc_skipped_commits; + unsigned long fc_numblks; + u64 s_fc_avg_commit_time; +}; + +#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4 + +/* + * Physical block regions added to different inodes due to fast commit + * recovery. These are set during the SCAN phase. During the replay phase, + * our allocator excludes these from its allocation. This ensures that + * we don't accidentally allocating a block that is going to be used by + * another inode. + */ +struct ext4_fc_alloc_region { + ext4_lblk_t lblk; + ext4_fsblk_t pblk; + int ino, len; +}; + +/* + * Fast commit replay state. 
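The fast-commit on-disk area above is a sequence of TLV records: a struct ext4_fc_tl header (16-bit tag, 16-bit length) followed by fc_len bytes of tag-specific value. A minimal sketch of walking such an area, with a host-endian uint16_t header standing in for the __le16 fields (endianness conversion and real tag handling omitted):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fc_tl {		/* shape of struct ext4_fc_tl, host endian here */
	uint16_t tag;
	uint16_t len;
};

static void walk_tlvs(const uint8_t *area, size_t size)
{
	size_t off = 0;

	while (off + sizeof(struct fc_tl) <= size) {
		struct fc_tl tl;

		memcpy(&tl, area + off, sizeof(tl));	/* read the header */
		off += sizeof(tl);
		if (off + tl.len > size)		/* truncated value: stop */
			break;
		printf("tag=%u len=%u\n", tl.tag, tl.len);
		off += tl.len;				/* skip the value bytes */
	}
}

int main(void)
{
	uint8_t buf[8] = {0};
	struct fc_tl tl = { .tag = 7 /* PAD-style tag */, .len = 4 };

	memcpy(buf, &tl, sizeof(tl));
	walk_tlvs(buf, sizeof(buf));
	return 0;
}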
+ */ +struct ext4_fc_replay_state { + int fc_replay_num_tags; + int fc_replay_expected_off; + int fc_current_pass; + int fc_cur_tag; + int fc_crc; + struct ext4_fc_alloc_region *fc_regions; + int fc_regions_size, fc_regions_used, fc_regions_valid; + int *fc_modified_inodes; + int fc_modified_inodes_used, fc_modified_inodes_size; +}; + +#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) +#endif + +static inline const char *tag2str(__u16 tag) +{ + switch (tag) { + case EXT4_FC_TAG_LINK: + return "ADD_ENTRY"; + case EXT4_FC_TAG_UNLINK: + return "DEL_ENTRY"; + case EXT4_FC_TAG_ADD_RANGE: + return "ADD_RANGE"; + case EXT4_FC_TAG_CREAT: + return "CREAT_DENTRY"; + case EXT4_FC_TAG_DEL_RANGE: + return "DEL_RANGE"; + case EXT4_FC_TAG_INODE: + return "INODE"; + case EXT4_FC_TAG_PAD: + return "PAD"; + case EXT4_FC_TAG_TAIL: + return "TAIL"; + case EXT4_FC_TAG_HEAD: + return "HEAD"; + default: + return "ERROR"; + } +} + +#endif /* __FAST_COMMIT_H__ */ diff --git a/ops/os_stat/os_stat/include_6_6/include/generated/asm-offsets.h b/ops/os_stat/os_stat/include_6_6/include/generated/asm-offsets.h new file mode 100644 index 0000000000000000000000000000000000000000..0162024777ea4c6acd2ae92886d5ace92e6631a9 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/include/generated/asm-offsets.h @@ -0,0 +1,99 @@ +#ifndef __ASM_OFFSETS_H__ +#define __ASM_OFFSETS_H__ +/* + * DO NOT MODIFY. + * + * This file was generated by Kbuild + */ + + +#define KVM_STEAL_TIME_preempted 16 /* offsetof(struct kvm_steal_time, preempted) */ + +#define pt_regs_bx 40 /* offsetof(struct pt_regs, bx) */ +#define pt_regs_cx 88 /* offsetof(struct pt_regs, cx) */ +#define pt_regs_dx 96 /* offsetof(struct pt_regs, dx) */ +#define pt_regs_sp 152 /* offsetof(struct pt_regs, sp) */ +#define pt_regs_bp 32 /* offsetof(struct pt_regs, bp) */ +#define pt_regs_si 104 /* offsetof(struct pt_regs, si) */ +#define pt_regs_di 112 /* offsetof(struct pt_regs, di) */ +#define pt_regs_r8 72 /* offsetof(struct pt_regs, r8) */ +#define pt_regs_r9 64 /* offsetof(struct pt_regs, r9) */ +#define pt_regs_r10 56 /* offsetof(struct pt_regs, r10) */ +#define pt_regs_r11 48 /* offsetof(struct pt_regs, r11) */ +#define pt_regs_r12 24 /* offsetof(struct pt_regs, r12) */ +#define pt_regs_r13 16 /* offsetof(struct pt_regs, r13) */ +#define pt_regs_r14 8 /* offsetof(struct pt_regs, r14) */ +#define pt_regs_r15 0 /* offsetof(struct pt_regs, r15) */ +#define pt_regs_flags 144 /* offsetof(struct pt_regs, flags) */ + +#define saved_context_cr0 200 /* offsetof(struct saved_context, cr0) */ +#define saved_context_cr2 208 /* offsetof(struct saved_context, cr2) */ +#define saved_context_cr3 216 /* offsetof(struct saved_context, cr3) */ +#define saved_context_cr4 224 /* offsetof(struct saved_context, cr4) */ +#define saved_context_gdt_desc 266 /* offsetof(struct saved_context, gdt_desc) */ + + +#define FIXED_stack_canary 40 /* offsetof(struct fixed_percpu_data, stack_canary) */ + + +#define TASK_threadsp 9688 /* offsetof(struct task_struct, thread.sp) */ +#define TASK_stack_canary 2784 /* offsetof(struct task_struct, stack_canary) */ + +#define pbe_address 0 /* offsetof(struct pbe, address) */ +#define pbe_orig_address 8 /* offsetof(struct pbe, orig_address) */ +#define pbe_next 16 /* offsetof(struct pbe, next) */ + +#define IA32_SIGCONTEXT_ax 44 /* offsetof(struct sigcontext_32, ax) */ +#define IA32_SIGCONTEXT_bx 32 /* offsetof(struct sigcontext_32, bx) */ +#define IA32_SIGCONTEXT_cx 40 /* offsetof(struct sigcontext_32, cx) */ +#define IA32_SIGCONTEXT_dx 36 
/* offsetof(struct sigcontext_32, dx) */ +#define IA32_SIGCONTEXT_si 20 /* offsetof(struct sigcontext_32, si) */ +#define IA32_SIGCONTEXT_di 16 /* offsetof(struct sigcontext_32, di) */ +#define IA32_SIGCONTEXT_bp 24 /* offsetof(struct sigcontext_32, bp) */ +#define IA32_SIGCONTEXT_sp 28 /* offsetof(struct sigcontext_32, sp) */ +#define IA32_SIGCONTEXT_ip 56 /* offsetof(struct sigcontext_32, ip) */ + +#define IA32_RT_SIGFRAME_sigcontext 164 /* offsetof(struct rt_sigframe_ia32, uc.uc_mcontext) */ + +#define TDX_MODULE_rcx 0 /* offsetof(struct tdx_module_output, rcx) */ +#define TDX_MODULE_rdx 8 /* offsetof(struct tdx_module_output, rdx) */ +#define TDX_MODULE_r8 16 /* offsetof(struct tdx_module_output, r8) */ +#define TDX_MODULE_r9 24 /* offsetof(struct tdx_module_output, r9) */ +#define TDX_MODULE_r10 32 /* offsetof(struct tdx_module_output, r10) */ +#define TDX_MODULE_r11 40 /* offsetof(struct tdx_module_output, r11) */ + +#define TDX_HYPERCALL_r8 0 /* offsetof(struct tdx_hypercall_args, r8) */ +#define TDX_HYPERCALL_r9 8 /* offsetof(struct tdx_hypercall_args, r9) */ +#define TDX_HYPERCALL_r10 16 /* offsetof(struct tdx_hypercall_args, r10) */ +#define TDX_HYPERCALL_r11 24 /* offsetof(struct tdx_hypercall_args, r11) */ +#define TDX_HYPERCALL_r12 32 /* offsetof(struct tdx_hypercall_args, r12) */ +#define TDX_HYPERCALL_r13 40 /* offsetof(struct tdx_hypercall_args, r13) */ +#define TDX_HYPERCALL_r14 48 /* offsetof(struct tdx_hypercall_args, r14) */ +#define TDX_HYPERCALL_r15 56 /* offsetof(struct tdx_hypercall_args, r15) */ +#define TDX_HYPERCALL_rdi 64 /* offsetof(struct tdx_hypercall_args, rdi) */ +#define TDX_HYPERCALL_rsi 72 /* offsetof(struct tdx_hypercall_args, rsi) */ +#define TDX_HYPERCALL_rbx 80 /* offsetof(struct tdx_hypercall_args, rbx) */ +#define TDX_HYPERCALL_rdx 88 /* offsetof(struct tdx_hypercall_args, rdx) */ + +#define BP_scratch 484 /* offsetof(struct boot_params, scratch) */ +#define BP_secure_boot 492 /* offsetof(struct boot_params, secure_boot) */ +#define BP_loadflags 529 /* offsetof(struct boot_params, hdr.loadflags) */ +#define BP_hardware_subarch 572 /* offsetof(struct boot_params, hdr.hardware_subarch) */ +#define BP_version 518 /* offsetof(struct boot_params, hdr.version) */ +#define BP_kernel_alignment 560 /* offsetof(struct boot_params, hdr.kernel_alignment) */ +#define BP_init_size 608 /* offsetof(struct boot_params, hdr.init_size) */ +#define BP_pref_address 600 /* offsetof(struct boot_params, hdr.pref_address) */ + +#define PTREGS_SIZE 168 /* sizeof(struct pt_regs) */ +#define TLB_STATE_user_pcid_flush_mask 22 /* offsetof(struct tlb_state, user_pcid_flush_mask) */ +#define CPU_ENTRY_AREA_entry_stack 4096 /* offsetof(struct cpu_entry_area, entry_stack_page) */ +#define SIZEOF_entry_stack 4096 /* sizeof(struct entry_stack) */ +#define MASK_entry_stack -4096 /* (~(sizeof(struct entry_stack) - 1)) */ +#define TSS_sp0 4 /* offsetof(struct tss_struct, x86_tss.sp0) */ +#define TSS_sp1 12 /* offsetof(struct tss_struct, x86_tss.sp1) */ +#define TSS_sp2 20 /* offsetof(struct tss_struct, x86_tss.sp2) */ +#define X86_top_of_stack 24 /* offsetof(struct pcpu_hot, top_of_stack) */ +#define X86_current_task 0 /* offsetof(struct pcpu_hot, current_task) */ +#define X86_call_depth 16 /* offsetof(struct pcpu_hot, call_depth) */ + +#endif diff --git a/ops/os_stat/os_stat/include_6_6/include/linux/nospec.h b/ops/os_stat/os_stat/include_6_6/include/linux/nospec.h new file mode 100644 index 0000000000000000000000000000000000000000..9f0af4f116d9853b873bced5ea9ab1a4e75f8fa8 --- 
/dev/null +++ b/ops/os_stat/os_stat/include_6_6/include/linux/nospec.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Linus Torvalds. All rights reserved. +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved. +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#ifndef _LINUX_NOSPEC_H +#define _LINUX_NOSPEC_H + +#include +#include + +struct task_struct; + +#ifndef barrier_nospec +# define barrier_nospec() do { } while (0) +#endif + +/** + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * When @index is out of bounds (@index >= @size), the sign bit will be + * set. Extend the sign bit to all bits and invert, giving a result of + * zero for an out of bounds index, or ~0 if within bounds [0, @size). + */ +#ifndef array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + /* + * Always calculate and emit the mask even if the compiler + * thinks the mask is not needed. The compiler does not take + * into account the value of @index under speculation. + */ + OPTIMIZER_HIDE_VAR(index); + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); +} +#endif + +/* + * array_index_nospec - sanitize an array index after a bounds check + * + * For a code sequence like: + * + * if (index < size) { + * index = array_index_nospec(index, size); + * val = array[index]; + * } + * + * ...if the CPU speculates past the bounds check then + * array_index_nospec() will clamp the index within the range of [0, + * size). + */ +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ + \ + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ + \ + (typeof(_i)) (_i & _mask); \ +}) + +/* Speculation control prctl */ +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which); +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl); +/* Speculation control for seccomp enforced mitigation */ +void arch_seccomp_spec_mitigate(struct task_struct *task); + +#endif /* _LINUX_NOSPEC_H */ diff --git a/ops/os_stat/os_stat/include_6_6/kernel/module/internal.h b/ops/os_stat/os_stat/include_6_6/kernel/module/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..c8b7b4dcf7820dcfea57c5ea5003ac2094285855 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/module/internal.h @@ -0,0 +1,406 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * Copyright (C) 2023 Luis Chamberlain + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifndef ARCH_SHF_SMALL +#define ARCH_SHF_SMALL 0 +#endif + +/* + * Use highest 4 bits of sh_entsize to store the mod_mem_type of this + * section. This leaves 28 bits for offset on 32-bit systems, which is + * about 256 MiB (WARN_ON_ONCE if we exceed that). 
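The comment above describes keeping the mod_mem_type in the highest 4 bits of sh_entsize and the section offset in the remaining bits; the SH_ENTSIZE_* masks that follow implement exactly that split. A small userspace sketch of the same type/offset packing (illustrative macro names, 64-bit word assumed):

#include <stdint.h>
#include <stdio.h>

#define TYPE_BITS   4
#define TYPE_SHIFT  (64 - TYPE_BITS)
#define TYPE_MASK   ((1ULL << TYPE_BITS) - 1)
#define OFFSET_MASK ((1ULL << TYPE_SHIFT) - 1)

static uint64_t pack(unsigned int type, uint64_t offset)
{
	/* type occupies the top 4 bits, the offset everything below */
	return ((uint64_t)(type & TYPE_MASK) << TYPE_SHIFT) | (offset & OFFSET_MASK);
}

int main(void)
{
	uint64_t entsize = pack(3, 0x1234);

	printf("type=%llu offset=%#llx\n",
	       (unsigned long long)(entsize >> TYPE_SHIFT),
	       (unsigned long long)(entsize & OFFSET_MASK));
	return 0;
}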
+ */ + +#define SH_ENTSIZE_TYPE_BITS 4 +#define SH_ENTSIZE_TYPE_SHIFT (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS) +#define SH_ENTSIZE_TYPE_MASK ((1UL << SH_ENTSIZE_TYPE_BITS) - 1) +#define SH_ENTSIZE_OFFSET_MASK ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1) + +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +struct kernel_symbol { +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + int value_offset; + int name_offset; + int namespace_offset; +#else + unsigned long value; + const char *name; + const char *namespace; +#endif +}; + +extern struct mutex module_mutex; +extern struct list_head modules; + +extern struct module_attribute *modinfo_attrs[]; +extern size_t modinfo_attrs_count; + +/* Provided by the linker */ +extern const struct kernel_symbol __start___ksymtab[]; +extern const struct kernel_symbol __stop___ksymtab[]; +extern const struct kernel_symbol __start___ksymtab_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_gpl[]; +extern const s32 __start___kcrctab[]; +extern const s32 __start___kcrctab_gpl[]; + +struct load_info { + const char *name; + /* pointer to module in temporary copy, freed at end of load_module() */ + struct module *mod; + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; + bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif +#ifdef CONFIG_MODULE_DECOMPRESS +#ifdef CONFIG_MODULE_STATS + unsigned long compressed_len; +#endif + struct page **pages; + unsigned int max_pages; + unsigned int used_pages; +#endif + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + +enum mod_license { + NOT_GPL_ONLY, + GPL_ONLY, +}; + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const s32 *crc; + const struct kernel_symbol *sym; + enum mod_license license; +}; + +int mod_verify_sig(const void *mod, struct load_info *info); +int try_to_force_load(struct module *mod, const char *reason); +bool find_symbol(struct find_symbol_arg *fsa); +struct module *find_module_all(const char *name, size_t len, bool even_unformed); +int cmp_name(const void *name, const void *sym); +long module_get_offset_and_type(struct module *mod, enum mod_mem_type type, + Elf_Shdr *sechdr, unsigned int section); +char *module_flags(struct module *mod, char *buf, bool show_state); +size_t module_flags_taint(unsigned long taints, char *buf); + +char *module_next_tag_pair(char *string, unsigned long *secsize); + +#define for_each_modinfo_entry(entry, info, name) \ + for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry)) + +static inline void module_assert_mutex_or_preempt(void) +{ +#ifdef CONFIG_LOCKDEP + if (unlikely(!debug_locks)) + return; + + WARN_ON_ONCE(!rcu_read_lock_sched_held() && + !lockdep_is_held(&module_mutex)); +#endif +} + +static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +#ifdef CONFIG_LIVEPATCH +int copy_module_elf(struct module *mod, struct load_info *info); +void free_module_elf(struct module *mod); +#else /* !CONFIG_LIVEPATCH */ +static inline int copy_module_elf(struct module *mod, struct load_info *info) +{ + return 0; +} + +static inline void free_module_elf(struct module *mod) 
{ } +#endif /* CONFIG_LIVEPATCH */ + +static inline bool set_livepatch_module(struct module *mod) +{ +#ifdef CONFIG_LIVEPATCH + mod->klp = true; + return true; +#else + return false; +#endif +} + +/** + * enum fail_dup_mod_reason - state at which a duplicate module was detected + * + * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but + * we've determined that another module with the same name is already loaded + * or being processed on our &modules list. This happens on early_mod_check() + * right before layout_and_allocate(). The kernel would have already + * vmalloc()'d space for the entire module through finit_module(). If + * decompression was used two vmap() spaces were used. These failures can + * happen when userspace has not seen the module present on the kernel and + * tries to load the module multiple times at same time. + * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation + * checks and the kernel determines that the module was unique and because + * of this allocated yet another private kernel copy of the module space in + * layout_and_allocate() but after this determined in add_unformed_module() + * that another module with the same name is already loaded or being processed. + * These failures should be mitigated as much as possible and are indicative + * of really fast races in loading modules. Without module decompression + * they waste twice as much vmap space. With module decompression three + * times the module's size vmap space is wasted. + */ +enum fail_dup_mod_reason { + FAIL_DUP_MOD_BECOMING = 0, + FAIL_DUP_MOD_LOAD, +}; + +#ifdef CONFIG_MODULE_DEBUGFS +extern struct dentry *mod_debugfs_root; +#endif + +#ifdef CONFIG_MODULE_STATS + +#define mod_stat_add_long(count, var) atomic_long_add(count, var) +#define mod_stat_inc(name) atomic_inc(name) + +extern atomic_long_t total_mod_size; +extern atomic_long_t total_text_size; +extern atomic_long_t invalid_kread_bytes; +extern atomic_long_t invalid_decompress_bytes; + +extern atomic_t modcount; +extern atomic_t failed_kreads; +extern atomic_t failed_decompress; +struct mod_fail_load { + struct list_head list; + char name[MODULE_NAME_LEN]; + atomic_long_t count; + unsigned long dup_fail_mask; +}; + +int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason); +void mod_stat_bump_invalid(struct load_info *info, int flags); +void mod_stat_bump_becoming(struct load_info *info, int flags); + +#else + +#define mod_stat_add_long(name, var) +#define mod_stat_inc(name) + +static inline int try_add_failed_module(const char *name, + enum fail_dup_mod_reason reason) +{ + return 0; +} + +static inline void mod_stat_bump_invalid(struct load_info *info, int flags) +{ +} + +static inline void mod_stat_bump_becoming(struct load_info *info, int flags) +{ +} + +#endif /* CONFIG_MODULE_STATS */ + +#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS +bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret); +void kmod_dup_request_announce(char *module_name, int ret); +#else +static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret) +{ + return false; +} + +static inline void kmod_dup_request_announce(char *module_name, int ret) +{ +} +#endif + +#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING +struct mod_unload_taint { + struct list_head list; + char name[MODULE_NAME_LEN]; + unsigned long taints; + u64 count; +}; + +int try_add_tainted_module(struct module *mod); +void print_unloaded_tainted_modules(void); +#else /* 
!CONFIG_MODULE_UNLOAD_TAINT_TRACKING */ +static inline int try_add_tainted_module(struct module *mod) +{ + return 0; +} + +static inline void print_unloaded_tainted_modules(void) +{ +} +#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */ + +#ifdef CONFIG_MODULE_DECOMPRESS +int module_decompress(struct load_info *info, const void *buf, size_t size); +void module_decompress_cleanup(struct load_info *info); +#else +static inline int module_decompress(struct load_info *info, + const void *buf, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline void module_decompress_cleanup(struct load_info *info) +{ +} +#endif + +struct mod_tree_root { +#ifdef CONFIG_MODULES_TREE_LOOKUP + struct latch_tree_root root; +#endif + unsigned long addr_min; + unsigned long addr_max; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + unsigned long data_addr_min; + unsigned long data_addr_max; +#endif +}; + +extern struct mod_tree_root mod_tree; + +#ifdef CONFIG_MODULES_TREE_LOOKUP +void mod_tree_insert(struct module *mod); +void mod_tree_remove_init(struct module *mod); +void mod_tree_remove(struct module *mod); +struct module *mod_find(unsigned long addr, struct mod_tree_root *tree); +#else /* !CONFIG_MODULES_TREE_LOOKUP */ + +static inline void mod_tree_insert(struct module *mod) { } +static inline void mod_tree_remove_init(struct module *mod) { } +static inline void mod_tree_remove(struct module *mod) { } +static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) +{ + struct module *mod; + + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { + if (within_module(addr, mod)) + return mod; + } + + return NULL; +} +#endif /* CONFIG_MODULES_TREE_LOOKUP */ + +void module_enable_ro(const struct module *mod, bool after_init); +void module_enable_nx(const struct module *mod); +void module_enable_x(const struct module *mod); +int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod); + +#ifdef CONFIG_MODULE_SIG +int module_sig_check(struct load_info *info, int flags); +#else /* !CONFIG_MODULE_SIG */ +static inline int module_sig_check(struct load_info *info, int flags) +{ + return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +#ifdef CONFIG_DEBUG_KMEMLEAK +void kmemleak_load_module(const struct module *mod, const struct load_info *info); +#else /* !CONFIG_DEBUG_KMEMLEAK */ +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } +#endif /* CONFIG_DEBUG_KMEMLEAK */ + +#ifdef CONFIG_KALLSYMS +void init_build_id(struct module *mod, const struct load_info *info); +void layout_symtab(struct module *mod, struct load_info *info); +void add_kallsyms(struct module *mod, const struct load_info *info); + +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} +#else /* !CONFIG_KALLSYMS */ +static inline void init_build_id(struct module *mod, const struct load_info *info) { } +static inline void layout_symtab(struct module *mod, struct load_info *info) { } +static inline void add_kallsyms(struct module *mod, const struct load_info *info) { } +#endif /* CONFIG_KALLSYMS */ + +#ifdef CONFIG_SYSFS +int mod_sysfs_setup(struct module *mod, const struct load_info *info, + struct kernel_param *kparam, unsigned int num_params); +void mod_sysfs_teardown(struct module *mod); +void init_param_lock(struct module *mod); +#else /* !CONFIG_SYSFS */ +static inline int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + 
struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline void mod_sysfs_teardown(struct module *mod) { } +static inline void init_param_lock(struct module *mod) { } +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MODVERSIONS +int check_version(const struct load_info *info, + const char *symname, struct module *mod, const s32 *crc); +void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, + struct kernel_symbol *ks, struct tracepoint * const *tp); +int check_modstruct_version(const struct load_info *info, struct module *mod); +int same_magic(const char *amagic, const char *bmagic, bool has_crcs); +#else /* !CONFIG_MODVERSIONS */ +static inline int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + return 1; +} + +static inline int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + return 1; +} + +static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs) +{ + return strcmp(amagic, bmagic) == 0; +} +#endif /* CONFIG_MODVERSIONS */ diff --git a/ops/os_stat/os_stat/include_6_6/kernel/sched/autogroup.h b/ops/os_stat/os_stat/include_6_6/kernel/sched/autogroup.h new file mode 100644 index 0000000000000000000000000000000000000000..90d69f2c5eafd2a1d96d42cbc99b6ec3cc6ea3f5 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/sched/autogroup.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_SCHED_AUTOGROUP_H +#define _KERNEL_SCHED_AUTOGROUP_H + +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + /* + * Reference doesn't mean how many threads attach to this + * autogroup now. It just stands for the number of tasks + * which could use this autogroup. 
+ */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + extern unsigned int sysctl_sched_autogroup_enabled; + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} + +#endif /* CONFIG_SCHED_AUTOGROUP */ + +#endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/ops/os_stat/os_stat/include_6_6/kernel/sched/cpudeadline.h b/ops/os_stat/os_stat/include_6_6/kernel/sched/cpudeadline.h new file mode 100644 index 0000000000000000000000000000000000000000..0adeda93b5fb56e3086a3a059338ab2cc8fc58ba --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/sched/cpudeadline.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define IDX_INVALID -1 + +struct cpudl_item { + u64 dl; + int cpu; + int idx; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; +}; + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); +int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); +void cpudl_cleanup(struct cpudl *cp); +#endif /* CONFIG_SMP */ diff --git a/ops/os_stat/os_stat/include_6_6/kernel/sched/cpupri.h b/ops/os_stat/os_stat/include_6_6/kernel/sched/cpupri.h new file mode 100644 index 0000000000000000000000000000000000000000..d6cba0020064cc7a70a18d762adb3600dfed065f --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/sched/cpupri.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) + +#define CPUPRI_INVALID -1 +#define CPUPRI_NORMAL 0 +/* values 1-99 are for RT1-RT99 priorities */ +#define CPUPRI_HIGHER 100 + +struct cpupri_vec { + atomic_t count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int *cpu_to_pri; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask); +int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#endif diff --git a/ops/os_stat/os_stat/include_6_6/kernel/sched/ext.h 
b/ops/os_stat/os_stat/include_6_6/kernel/sched/ext.h new file mode 100644 index 0000000000000000000000000000000000000000..27248760f4ccb65e53a1cc49ab905189612b3ba8 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/sched/ext.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_EXEC = WF_EXEC, + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with + * %SCX_ENQ_LAST and %SCX_ENQ_LOCAL flags set. + * + * If the BPF scheduler wants to continue executing the task, + * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. + * If the task gets queued on a different dsq or the BPF side, the BPF + * scheduler is responsible for triggering a follow-up scheduling event. + * Otherwise, Execution may stall. + */ + SCX_ENQ_LAST = 1LLU << 41, + + /* + * A hint indicating that it's advisable to enqueue the task on the + * local dsq of the currently selected CPU. Currently used by + * select_cpu_dfl() and together with %SCX_ENQ_LAST. + */ + SCX_ENQ_LOCAL = 1LLU << 42, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. 
+ */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ +}; + +enum scx_kick_flags { + SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ + SCX_KICK_WAIT = 1LLU << 1, /* wait for the CPU to be rescheduled */ +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +#ifdef CONFIG_SCHED_CLASS_EXT + +struct sched_enq_and_set_ctx { + struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + +extern const struct sched_class ext_sched_class; +extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; +extern const struct file_operations sched_ext_fops; +extern unsigned long scx_watchdog_timeout; +extern unsigned long scx_watchdog_timestamp; + +DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); +#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); + +static inline bool task_on_scx(const struct task_struct *p) +{ + return scx_enabled() && p->sched_class == &ext_sched_class; +} + +bool task_should_scx(struct task_struct *p); +void scx_pre_fork(struct task_struct *p); +int scx_fork(struct task_struct *p); +void scx_post_fork(struct task_struct *p); +void scx_cancel_fork(struct task_struct *p); +int scx_check_setscheduler(struct task_struct *p, int policy); +bool scx_can_stop_tick(struct rq *rq); +void init_sched_ext_class(void); + +__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, + const char *fmt, ...); +#define scx_ops_error(fmt, args...) \ + scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) + +void __scx_notify_pick_next_task(struct rq *rq, + struct task_struct *p, + const struct sched_class *active); + +static inline void scx_notify_pick_next_task(struct rq *rq, + struct task_struct *p, + const struct sched_class *active) +{ + if (!scx_enabled()) + return; +#ifdef CONFIG_SMP + /* + * Pairs with the smp_load_acquire() issued by a CPU in + * kick_cpus_irq_workfn() who is waiting for this CPU to perform a + * resched. 
+ */ + smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); +#endif + if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + return; + __scx_notify_pick_next_task(rq, p, active); +} + +static inline void scx_notify_sched_tick(void) +{ + unsigned long last_check; + + if (!scx_enabled()) + return; + + last_check = scx_watchdog_timestamp; + if (unlikely(time_after(jiffies, last_check + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } +} + +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; + if (scx_switched_all() && class == &fair_sched_class) + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; + return class; +} + +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) + +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +/* + * SCX requires a balance() call before every pick_next_task() call including + * when waking up from idle. + */ +#define for_balance_class_range(class, prev_class, end_class) \ + for_active_class_range(class, (prev_class) > &ext_sched_class ? \ + &ext_sched_class : (prev_class), (end_class)) + +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + +#else /* CONFIG_SCHED_CLASS_EXT */ + +#define scx_enabled() false +#define scx_switched_all() false + +static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline void scx_pre_fork(struct task_struct *p) {} +static inline int scx_fork(struct task_struct *p) { return 0; } +static inline void scx_post_fork(struct task_struct *p) {} +static inline void scx_cancel_fork(struct task_struct *p) {} +static inline int scx_check_setscheduler(struct task_struct *p, + int policy) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } +static inline void init_sched_ext_class(void) {} +static inline void scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *p, + const struct sched_class *active) {} +static inline void scx_notify_sched_tick(void) {} + +#define for_each_active_class for_each_class +#define for_balance_class_range for_class_range + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +void __scx_update_idle(struct rq *rq, bool idle); + +static inline void scx_update_idle(struct rq *rq, bool idle) +{ + if (scx_enabled()) + __scx_update_idle(rq, idle); +} +#else +static inline void scx_update_idle(struct rq *rq, bool idle) {} +#endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_move_task(struct task_struct *p) {} 
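Illustrative aside (not part of the vendored header): a minimal userspace C sketch of the next_active_class() skip logic above, under the assumption that the classes can be modelled as a simple ordered enum. It shows how the fair class is skipped once all tasks have been switched to the ext class and how the ext class is skipped when no BPF scheduler is enabled. The mock_class enum, its ordering, and the two boolean flags are invented stand-ins for the kernel's sched-class section layout and static keys.

/*
 * Userspace sketch of next_active_class(); compile with: cc -std=c99 demo.c
 * The class ordering here is simplified and only for illustration.
 */
#include <stdio.h>
#include <stdbool.h>

enum mock_class { STOP, DL, RT, FAIR, EXT, IDLE, NR_CLASSES };
static const char *names[NR_CLASSES] = { "stop", "dl", "rt", "fair", "ext", "idle" };

static bool scx_enabled_mock = true;       /* assume a BPF scheduler is loaded */
static bool scx_switched_all_mock = false; /* fair class still owns some tasks */

/* Mirror of next_active_class(): skip fair when every task has moved to ext,
 * skip ext when no BPF scheduler is enabled. */
static enum mock_class next_active(enum mock_class c)
{
	c++;
	if (scx_switched_all_mock && c == FAIR)
		c++;
	if (!scx_enabled_mock && c == EXT)
		c++;
	return c;
}

int main(void)
{
	/* Walks the classes the way for_each_active_class() would. */
	for (enum mock_class c = STOP; c < NR_CLASSES; c = next_active(c))
		printf("%s\n", names[c]);
	return 0;
}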
+static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/ops/os_stat/os_stat/include_6_6/kernel/sched/features.h b/ops/os_stat/os_stat/include_6_6/kernel/sched/features.h new file mode 100644 index 0000000000000000000000000000000000000000..f770168230ae4a09dd0f240957c0c7d749001a50 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/sched/features.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Using the avg_vruntime, do the right thing and preserve lag across + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ +SCHED_FEAT(PLACE_LAG, true) +SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) +SCHED_FEAT(RUN_TO_PARITY, true) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, false) + +/* + * Consider buddies to be cache hot, decreases the likeliness of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, true) + +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(HRTICK_DL, false) +SCHED_FEAT(DOUBLE_TICK, false) + +/* + * Decrement CPU capacity based on time not spent running tasks + */ +SCHED_FEAT(NONTASK_CAPACITY, true) + +#ifdef CONFIG_PREEMPT_RT +SCHED_FEAT(TTWU_QUEUE, false) +#else + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, true) +#endif + +/* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. + */ +SCHED_FEAT(SIS_PROP, false) +SCHED_FEAT(SIS_UTIL, true) + +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. + */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. + */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + +SCHED_FEAT(RT_RUNTIME_SHARE, false) +SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) + +SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) + +/* + * UtilEstimation. Use estimated CPU utilization. 
+ */ +SCHED_FEAT(UTIL_EST, true) +SCHED_FEAT(UTIL_EST_FASTUP, true) + +SCHED_FEAT(LATENCY_WARN, false) + +SCHED_FEAT(HZ_BW, true) diff --git a/ops/os_stat/os_stat/include_6_6/kernel/sched/sched.h b/ops/os_stat/os_stat/include_6_6/kernel/sched/sched.h new file mode 100644 index 0000000000000000000000000000000000000000..3938a369e1439a4911b7e07cb0b67101aad21ad4 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/sched/sched.h @@ -0,0 +1,3687 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal types and methods: + */ +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../workqueue_internal.h" + +#ifdef CONFIG_CGROUP_SCHED +#include +#include +#endif + +#ifdef CONFIG_SCHED_DEBUG +# include +#endif + +#ifdef CONFIG_PARAVIRT +# include +# include +#endif + +#include + +#include "cpupri.h" +#include "cpudeadline.h" + +#ifdef CONFIG_SCHED_DEBUG +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +#endif + +struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 + +extern __read_mostly int scheduler_running; + +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern unsigned int sysctl_sched_child_runs_first; + +extern void calc_global_load_tick(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); + +extern void call_trace_sched_update_nr_running(struct rq *rq, int count); + +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; +extern int sched_rr_timeslice; + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to + * increase coverage and consistency always enable it on 64-bit platforms. 
+ */ +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ +}) +#else +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) + +/* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE 10 + +/* + * Single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif + return policy == SCHED_NORMAL; +} + +static inline int fair_policy(int policy) +{ + return normal_policy(policy) || policy == SCHED_BATCH; +} + +static inline int rt_policy(int policy) +{ + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; +} +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || + rt_policy(policy) || dl_policy(policy); +} + +static inline int task_has_idle_policy(struct task_struct *p) +{ + return idle_policy(p->policy); +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +static inline void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff / 8; +} + +/* + * Shifting a value by an exponent greater *or equal* to the size of said value + * is UB; cap at size-1. + */ +#define shr_bound(val, shift) \ + (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) + +/* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. 
+ * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + +static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool dl_entity_preempt(const struct sched_dl_entity *a, + const struct sched_dl_entity *b) +{ + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; + unsigned int rt_period_active; + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +void __dl_clear_params(struct task_struct *p); + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* + * To keep the bandwidth of -deadline tasks under control + * we need some place where: + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. 
+ * + * With respect to SMP, bandwidth is given on a per root domain basis, + * meaning that: + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; + */ +struct dl_bw { + raw_spinlock_t lock; + u64 bw; + u64 total_bw; + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +extern int dl_bw_check_overflow(int cpu); + +#ifdef CONFIG_CGROUP_SCHED + +struct cfs_rq; +struct rt_rq; + +extern struct list_head task_groups; + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota; + u64 runtime; + u64 burst; + u64 runtime_snap; + s64 hierarchical_quota; + + u8 idle; + u8 period_active; + u8 slack_started; + struct hrtimer period_timer; + struct hrtimer slack_timer; + struct list_head throttled_cfs_rq; + + /* Statistics: */ + int nr_periods; + int nr_throttled; + int nr_burst; + u64 throttled_time; + u64 burst_time; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +/* Task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each CPU */ + struct sched_entity **se; + /* runqueue "owned" by this group on each CPU */ + struct cfs_rq **cfs_rq; + unsigned long shares; + + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; + +#ifdef CONFIG_SMP + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; +#endif +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif +#ifdef CONFIG_CGROUPFS + u64 cpuquota_aware; +#endif + struct cfs_bandwidth cfs_bandwidth; + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* The two decimal precision [%] value requested from user-space */ + unsigned int uclamp_pct[UCLAMP_CNT]; + /* Clamp values requested for a task group */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a task group */ + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. 
+ * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); +extern void unregister_fair_sched_group(struct task_group *tg); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent); + +extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); +extern bool cfs_task_bw_constrained(struct task_struct *p); + +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group *tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); + +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_release_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + +extern int sched_group_set_idle(struct task_group *tg, long idle); + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + return 0; +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; +static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } + +#endif /* CONFIG_CGROUP_SCHED */ + +extern void unregister_rt_sched_group(struct task_group *tg); +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); + +/* + * u64_u32_load/u64_u32_store + * + * 
Use a copy of a u64 value to protect against data race. This is only + * applicable for 32-bits architectures. + */ +#ifdef CONFIG_64BIT +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) +#else +# define u64_u32_load_copy(var, copy) \ +({ \ + u64 __val, __val_copy; \ + do { \ + __val_copy = copy; \ + /* \ + * paired with u64_u32_store_copy(), ordering access \ + * to var and copy. \ + */ \ + smp_rmb(); \ + __val = var; \ + } while (__val != __val_copy); \ + __val; \ +}) +# define u64_u32_store_copy(var, copy, val) \ +do { \ + typeof(val) __val = (val); \ + var = __val; \ + /* \ + * paired with u64_u32_load_copy(), ordering access to var and \ + * copy. \ + */ \ + smp_wmb(); \ + copy = __val; \ +} while (0) +#endif +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned int nr_running; + unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_nr_running; /* SCHED_IDLE */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ + + s64 avg_vruntime; + u64 avg_load; + + u64 exec_clock; + u64 min_vruntime; +#ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; +#endif + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root_cached tasks_timeline; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr; + struct sched_entity *next; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP + /* + * CFS load tracking + */ + struct sched_avg avg; +#ifndef CONFIG_64BIT + u64 last_update_time_copy; +#endif + struct { + raw_spinlock_t lock ____cacheline_aligned; + int nr; + unsigned long load_avg; + unsigned long util_avg; + unsigned long runnable_avg; + } removed; + +#ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. + * This list is used during load balance. 
+ */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + + /* Locally cached copy of our task_group's idle value */ + int idle; + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; + + u64 throttled_pelt_idle; +#ifndef CONFIG_64BIT + u64 throttled_pelt_idle_copy; +#endif + u64 throttled_clock; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; + u64 throttled_clock_self; + u64 throttled_clock_self_time; + int throttled; + int throttle_count; + struct list_head throttled_list; +#ifdef CONFIG_SMP + struct list_head throttled_csd_list; +#endif +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +#ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_CAN_STOP_TICK = 1 << 0, +}; + +struct scx_rq { + struct scx_dispatch_q local_dsq; + struct list_head watchdog_list; + unsigned long ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; + u32 flags; + bool cpu_released; + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; + struct irq_work kick_cpus_irq_work; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* RT IPI pull logic requires IRQ_WORK */ +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) +# define HAVE_RT_PUSH_IPI +#endif + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned int rt_nr_migratory; + unsigned int rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; + +#endif /* CONFIG_SMP */ + int rt_queued; + + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned int rt_nr_boosted; + + struct rq *rq; + struct task_group *tg; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root_cached root; + + unsigned int dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision whether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned int dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. 
+ */ + struct rb_root_cached pushable_dl_tasks_root; +#else + struct dl_bw dl_bw; +#endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM + * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB). + */ + u64 max_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) + +static inline void se_update_runnable(struct sched_entity *se) +{ + if (!entity_is_task(se)) + se->runnable_weight = se->my_q->h_nr_running; +} + +static inline long se_runnable(struct sched_entity *se) +{ + if (entity_is_task(se)) + return !!se->on_rq; + else + return se->runnable_weight; +} + +#else +#define entity_is_task(se) 1 + +static inline void se_update_runnable(struct sched_entity *se) {} + +static inline long se_runnable(struct sched_entity *se) +{ + return !!se->on_rq; +} +#endif + +#ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. + */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + + +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + +struct perf_domain { + struct em_perf_domain *em_pd; + struct perf_domain *next; + struct rcu_head rcu; +}; + +/* Scheduling group status flags */ +#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member CPUs from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ + int overload; + + /* Indicate one or more cpus over-utilized (tipping point) */ + int overutilized; + + /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + + /* + * Indicate whether a root_domain's dl_bw has been checked or + * updated. It's monotonously increasing value. + * + * Also, some corner cases, like 'wrap around' is dangerous, but given + * that u64 is 'big enough'. So that shouldn't be a concern. 
+ */ + u64 visit_gen; + +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; + + unsigned long max_cpu_capacity; + + /* + * NULL-terminated list of performance domains intersecting with the + * CPUs of the rd. Protected by RCU. + */ + struct perf_domain __rcu *pd; + + KABI_RESERVE(1); + KABI_RESERVE(2); + KABI_RESERVE(3); + KABI_RESERVE(4); +}; + +extern void init_defrootdomain(void); +extern int sched_init_domains(const struct cpumask *cpu_map); +extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); + +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_UCLAMP_TASK +/* + * struct uclamp_bucket - Utilization clamp bucket + * @value: utilization clamp value for tasks on this clamp bucket + * @tasks: number of RUNNABLE tasks on this clamp bucket + * + * Keep track of how many tasks are RUNNABLE for a given utilization + * clamp value. + */ +struct uclamp_bucket { + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); +}; + +/* + * struct uclamp_rq - rq's utilization clamp + * @value: currently active clamp values for a rq + * @bucket: utilization clamp buckets affecting a rq + * + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. + * A clamp value is affecting a rq when there is at least one task RUNNABLE + * (or actually running) with that value. + * + * There are up to UCLAMP_CNT possible different clamp values, currently there + * are only two: minimum utilization and maximum utilization. + * + * All utilization clamping values are MAX aggregated, since: + * - for util_min: we want to run the CPU at least at the max of the minimum + * utilization required by its currently RUNNABLE tasks. + * - for util_max: we want to allow the CPU to run up to the max of the + * maximum utilization allowed by its currently RUNNABLE tasks. + * + * Since on each system we expect only a limited number of different + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track + * the metrics required to compute all the per-rq utilization clamp values. + */ +struct uclamp_rq { + unsigned int value; + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; +}; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); +#endif /* CONFIG_UCLAMP_TASK */ + +struct rq; +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t __lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
+ */ + unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; +#endif +#ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; + call_single_data_t nohz_csd; +#endif /* CONFIG_SMP */ + unsigned int nohz_tick_stopped; + atomic_t nohz_flags; +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_SMP + unsigned int ttwu_pending; +#endif + u64 nr_switches; + +#ifdef CONFIG_UCLAMP_TASK + /* Utilization clamp values based on CPU's RUNNABLE tasks */ + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; + unsigned int uclamp_flags; +#define UCLAMP_FLAG_IDLE 0x01 +#endif + + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned int nr_uninterruptible; + + struct task_struct __rcu *curr; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; + + unsigned int clock_update_flags; + u64 clock; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; + u64 clock_pelt; + unsigned long lost_idle_time; + u64 clock_pelt_idle; + u64 clock_idle; +#ifndef CONFIG_64BIT + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; +#endif + + atomic_t nr_iowait; + +#ifdef CONFIG_SCHED_DEBUG + u64 last_seen_need_resched_ns; + int ticks_without_resched; +#endif + +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain __rcu *sd; + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; + + struct balance_callback *balance_callback; + + unsigned char nohz_idle_balance; + unsigned char idle_balance; + + unsigned long misfit_task_load; + + /* For active balancing */ + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + + /* CPU of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; + + struct sched_avg avg_rt; + struct sched_avg avg_dl; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + struct sched_avg avg_irq; +#endif +#ifdef CONFIG_SCHED_THERMAL_PRESSURE + struct sched_avg avg_thermal; +#endif + u64 idle_stamp; + u64 avg_idle; + + unsigned long wake_stamp; + u64 wake_avg_idle; + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; + +#ifdef CONFIG_HOTPLUG_CPU + struct rcuwait hotplug_wait; +#endif +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; + u64 psi_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + call_single_data_t hrtick_csd; +#endif + struct hrtimer hrtick_timer; + ktime_t hrtick_time; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* 
could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif + +#ifdef CONFIG_SMP + unsigned int nr_pinned; +#endif + unsigned int push_busy; + struct cpu_stop_work push_work; + +#ifdef CONFIG_SCHED_CORE + /* per rq */ + struct rq *core; + struct task_struct *core_pick; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; + + /* shared state -- careful with sched_core_cpu_deactivate() */ + unsigned int core_task_seq; + unsigned int core_pick_seq; + unsigned long core_cookie; + unsigned int core_forceidle_count; + unsigned int core_forceidle_seq; + unsigned int core_forceidle_occupation; + u64 core_forceidle_start; +#endif + + /* Scratch cpumask to be temporarily used under rq_lock */ + cpumask_var_t scratch_mask; + +#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) + call_single_data_t cfsb_csd; + struct list_head cfsb_csd_list; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); + KABI_RESERVE(3); + KABI_RESERVE(4); +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} +#endif + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +#define MDF_PUSH 0x01 + +static inline bool is_migration_disabled(struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->migration_disabled; +#else + return false; +#endif +} + +//DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +#define cpu_rq(cpu) (per_cpu(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +//#define raw_rq() raw_cpu_ptr(&runqueues) +#define raw_rq() raw_cpu_ptr(runqueues) + +struct sched_group; +#ifdef CONFIG_SCHED_CORE +static inline struct cpumask *sched_group_span(struct sched_group *sg); + +DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); + +static inline bool sched_core_enabled(struct rq *rq) +{ + return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled; +} + +static inline bool sched_core_disabled(void) +{ + return !static_branch_unlikely(&__sched_core_enabled); +} + +/* + * Be careful with this function; not for general use. The return value isn't + * stable unless you actually hold a relevant rq->__lock. + */ +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + if (sched_core_enabled(rq)) + return &rq->core->__lock; + + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + if (rq->core_enabled) + return &rq->core->__lock; + + return &rq->__lock; +} + +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool fi); +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); + +/* + * Helpers to check if the CPU's core cookie matches with the task's cookie + * when core scheduling is enabled. + * A special case is that the task's cookie always matches with CPU's core + * cookie if the CPU is in an idle core. 
+ */ +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + return rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + bool idle_core = true; + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { + if (!available_idle_cpu(cpu)) { + idle_core = false; + break; + } + } + + /* + * A CPU in an idle core is always the best choice for tasks with + * cookies. + */ + return idle_core || rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { + if (sched_core_cookie_match(cpu_rq(cpu), p)) + return true; + } + return false; +} + +static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +#else /* !CONFIG_SCHED_CORE */ + +static inline bool sched_core_enabled(struct rq *rq) +{ + return false; +} + +static inline bool sched_core_disabled(void) +{ + return true; +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + return true; +} +#endif /* CONFIG_SCHED_CORE */ + +static inline void lockdep_assert_rq_held(struct rq *rq) +{ + lockdep_assert_held(__rq_lockp(rq)); +} + +extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); +extern bool raw_spin_rq_trylock(struct rq *rq); +extern void raw_spin_rq_unlock(struct rq *rq); + +static inline void raw_spin_rq_lock(struct rq *rq) +{ + raw_spin_rq_lock_nested(rq, 0); +} + +static inline void raw_spin_rq_lock_irq(struct rq *rq) +{ + local_irq_disable(); + raw_spin_rq_lock(rq); +} + +static inline void raw_spin_rq_unlock_irq(struct rq *rq) +{ + raw_spin_rq_unlock(rq); + local_irq_enable(); +} + +static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) +{ + unsigned long flags; + local_irq_save(flags); + raw_spin_rq_lock(rq); + return flags; +} + +static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags) +{ + raw_spin_rq_unlock(rq); + local_irq_restore(flags); +} + +#define raw_spin_rq_lock_irqsave(rq, flags) \ +do { \ + flags = _raw_spin_rq_lock_irqsave(rq); \ +} while (0) + +#ifdef CONFIG_SCHED_SMT +extern void __update_idle_core(struct rq *rq); + +static inline void update_idle_core(struct rq *rq) +{ + if (static_branch_unlikely(&sched_smt_present)) + __update_idle_core(rq); +} + +#else +static inline void update_idle_core(struct rq *rq) { } 
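Illustrative aside (not part of the vendored header): a minimal userspace C sketch of the admission rule implemented by sched_core_cookie_match() above, namely that a task may run on a CPU when every SMT sibling of that CPU's core is idle, or when the core's cookie equals the task's cookie. The mock_core and mock_task types and the fixed two-sibling array are invented stand-ins for rq, task_struct and cpu_smt_mask().

#include <stdio.h>
#include <stdbool.h>

struct mock_core { unsigned long cookie; bool sibling_idle[2]; };
struct mock_task { unsigned long cookie; };

/* Mirrors sched_core_cookie_match(): an idle core always matches,
 * otherwise the core and task cookies must be equal. */
static bool cookie_match(const struct mock_core *core, const struct mock_task *p)
{
	bool idle_core = true;

	for (int i = 0; i < 2; i++)
		if (!core->sibling_idle[i])
			idle_core = false;

	return idle_core || core->cookie == p->cookie;
}

int main(void)
{
	struct mock_core busy_core = { .cookie = 42, .sibling_idle = { false, true } };
	struct mock_task same  = { .cookie = 42 };
	struct mock_task other = { .cookie = 7 };

	printf("same cookie:  %d\n", cookie_match(&busy_core, &same));  /* prints 1 */
	printf("other cookie: %d\n", cookie_match(&busy_core, &other)); /* prints 0 */
	return 0;
}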
+#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else + +#define task_of(_se) container_of(_se, struct task_struct, se) + +static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se) +{ + const struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + +extern void update_rq_clock(struct rq *rq); + +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq->clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPDATED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + +static inline u64 rq_clock(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_task; +} + +/** + * By default the decay is the default pelt decay period. + * The decay shift can change the decay period in + * multiples of 32. + * Decay shift Decay period(ms) + * 0 32 + * 1 64 + * 2 128 + * 3 256 + * 4 512 + */ +extern int sched_thermal_decay_shift; + +static inline u64 rq_clock_thermal(struct rq *rq) +{ + return rq_clock_task(rq) >> sched_thermal_decay_shift; +} + +static inline void rq_clock_skip_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttling, which is the only time a skip + * request is canceled.
+ */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +/* + * During cpu offlining and rq wide unthrottling, we can trigger + * an update_rq_clock() for several cfs and rt runqueues (Typically + * when using list_for_each_entry_*) + * rq_clock_start_loop_update() can be called after updating the clock + * once and before iterating over the list to prevent multiple update. + * After the iterative traversal, we need to call rq_clock_stop_loop_update() + * to clear RQCF_ACT_SKIP of rq->clock_update_flags. + */ +static inline void rq_clock_start_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + rq->clock_update_flags |= RQCF_ACT_SKIP; +} + +static inline void rq_clock_stop_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags &= ~RQCF_ACT_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). + */ + unsigned int clock_update_flags; +#endif +}; + +extern struct balance_callback balance_push_callback; + +/* + * Lockdep annotation that avoids accidental unlocks; it's like a + * sticky/continuous lockdep_assert_held(). + * + * This avoids code that has access to 'struct rq *rq' (basically everything in + * the scheduler) from accidentally unlocking the rq if they do not also have a + * copy of the (on-stack) 'struct rq_flags rf'. + * + * Also see Documentation/locking/lockdep-design.rst. + */ +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#ifdef CONFIG_SMP + SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); +#endif +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(__rq_lockp(rq), rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. 
+ */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif +} + +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock_irqsave(rq, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock_irq(rq); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock(rq); + rq_pin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock_irqrestore(rq, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock_irq(rq); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +DEFINE_LOCK_GUARD_1(rq_lock, struct rq, + rq_lock(_T->lock, &_T->rf), + rq_unlock(_T->lock, &_T->rf), + struct rq_flags rf) + +DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq, + rq_lock_irq(_T->lock, &_T->rf), + rq_unlock_irq(_T->lock, &_T->rf), + struct rq_flags rf) + +DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, + rq_lock_irqsave(_T->lock, &_T->rf), + rq_unlock_irqrestore(_T->lock, &_T->rf), + struct rq_flags rf) + +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +extern void sched_init_numa(int offline_node); +extern void sched_update_numa(int cpu, bool online); +extern void sched_domains_numa_masks_set(unsigned int cpu); +extern void sched_domains_numa_masks_clear(unsigned int cpu); +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); +#else +static inline void sched_init_numa(int offline_node) { } +static inline void sched_update_numa(int cpu, bool online) { } +static inline void sched_domains_numa_masks_set(unsigned int cpu) { } +static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + return nr_cpu_ids; +} +#endif + +#ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *p, struct 
task_struct *t, + int cpu, int scpu); +extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +#else +static inline void +init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_SMP + +static inline void +queue_balance_callback(struct rq *rq, + struct balance_callback *head, + void (*func)(struct rq *rq)) +{ + lockdep_assert_rq_held(rq); + + /* + * Don't (re)queue an already queued item; nor queue anything when + * balance_push() is active, see the comment with + * balance_push_callback. + */ + if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) + return; + + head->func = func; + head->next = rq->balance_callback; + rq->balance_callback = head; +} + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See destroy_sched_domains: call_rcu for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) + +/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | +static const unsigned int SD_SHARED_CHILD_MASK = +#include +0; +#undef SD_FLAG + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The CPU whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given CPU. + * + * Returns the highest sched_domain of a CPU which contains @flag. If @flag has + * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) { + hsd = sd; + continue; + } + + /* + * Stop the search if @flag is known to be shared at lower + * levels. It will not be found further up. + */ + if (flag & SD_SHARED_CHILD_MASK) + break; + } + + return hsd; +} + +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) + break; + } + + return sd; +} + +//DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_size); +DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); +//DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); +extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; + +static __always_inline bool sched_asym_cpucap_active(void) +{ + return static_branch_unlikely(&sched_asym_cpucapacity); +} + +struct sched_group_capacity { + atomic_t ref; + /* + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity + * for a single CPU. 
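+ * (SCHED_CAPACITY_SCALE is 1 << SCHED_CAPACITY_SHIFT, i.e. 1024, so a
+ * group of four full-capacity CPUs would report capacity == 4096.)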
+ */ + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ + +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[]; /* Balance mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + unsigned int cores; + struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* CPU of highest priority in group */ + int flags; + + KABI_RESERVE(1); + KABI_RESERVE(2); + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[]; +}; + +static inline struct cpumask *sched_group_span(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * See build_balance_mask(). + */ +static inline struct cpumask *group_balance_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgc->cpumask); +} + +extern int group_balance_cpu(struct sched_group *sg); + +#ifdef CONFIG_SCHED_DEBUG +void update_sched_domain_debugfs(void); +void dirty_sched_domain_sysctl(int cpu); +#else +static inline void update_sched_domain_debugfs(void) +{ +} +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} +#endif + +extern int sched_update_scaling(void); + +static inline const struct cpumask *task_user_cpus(struct task_struct *p) +{ + if (!p->user_cpus_ptr) + return cpu_possible_mask; /* &init_task.cpus_mask */ + return p->user_cpus_ptr; +} +#endif /* CONFIG_SMP */ + +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else + +static inline void sched_core_account_forceidle(struct rq *rq) {} + +static inline void sched_core_tick(struct rq *rq) {} + +#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. 
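+ *
+ * Consequently a reader that holds either lock sees a stable value, e.g.
+ * (illustrative):
+ *
+ *	rq = task_rq_lock(p, &rf);
+ *	tg = task_group(p);	// stable until task_rq_unlock()
+ *	task_rq_unlock(rq, p, &rf);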
+ */ +static inline struct task_group *task_group(struct task_struct *p) +{ + return p->sched_task_group; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; + p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfully executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + WRITE_ONCE(task_thread_info(p)->cpu, cpu); + p->wake_cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG + +/* + * To support run-time toggling of sched features, all the translation units + * (but core.c) reference the sysctl_sched_features defined in core.c. + */ +extern const_debug unsigned int sysctl_sched_features; + +#ifdef CONFIG_JUMP_LABEL +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ +{ \ + return static_key_##enabled(key); \ +} + +#include "features.h" +#undef SCHED_FEAT + +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) + +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ + +/* + * Each translation unit has its own copy of sysctl_sched_features to allow + * constants propagation at compile time and compiler optimization based on + * features default. 
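+ *
+ * With the mask a compile-time constant, sched_feat() below folds to 0 or 1,
+ * so e.g. "if (sched_feat(HRTICK))" is either compiled out entirely or turned
+ * into an unconditional branch instead of testing a variable at run time.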
+ */ +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | +static const_debug __maybe_unused unsigned int sysctl_sched_features = +#include "features.h" + 0; +#undef SCHED_FEAT + +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* SCHED_DEBUG */ + +extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_on_cpu(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; +} + +/* Wake flags. The first three directly map to some SD flag value */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ + +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ + +#ifdef CONFIG_SMP +static_assert(WF_EXEC == SD_BALANCE_EXEC); +static_assert(WF_FORK == SD_BALANCE_FORK); +static_assert(WF_TTWU == SD_BALANCE_WAKE); +#endif + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; + +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
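+ *
+ *        For example, a priority change dequeues the task with
+ *        DEQUEUE_SAVE | DEQUEUE_MOVE and puts it back with
+ *        ENQUEUE_RESTORE | ENQUEUE_MOVE (the flag values match, see below),
+ *        preserving its accumulated state while recomputing its position in
+ *        the runqueue for the new priority.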
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 + +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x40 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif +#define ENQUEUE_INITIAL 0x80 + +#define RETRY_TASK ((void *)-1UL) + +enum rq_onoff_reason { + RQ_ONOFF_HOTPLUG, /* CPU is going on/offline */ + RQ_ONOFF_TOPOLOGY, /* sched domain topology update */ +}; + +struct affinity_context { + const struct cpumask *new_mask; + struct cpumask *user_mask; + unsigned int flags; +}; + +struct sched_class { + +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + + struct task_struct *(*pick_next_task)(struct rq *rq); + + void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + +#ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + + struct task_struct * (*pick_task)(struct rq *rq); + + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); + + void (*rq_online)(struct rq *rq, enum rq_onoff_reason reason); + void (*rq_offline)(struct rq *rq, enum rq_onoff_reason reason); + + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); +#endif + + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + void (*task_fork)(struct task_struct *p); + void (*task_dead)(struct task_struct *p); + + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serialized by + * rq->lock. They are however serialized by p->pi_lock. 
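+ *
+ * A rough sketch of the caller side (see check_class_changed()):
+ *
+ *	if (prev_class != p->sched_class) {
+ *		if (prev_class->switched_from)
+ *			prev_class->switched_from(rq, p);
+ *		p->sched_class->switched_to(rq, p);
+ *	} else if (oldprio != p->prio)
+ *		p->sched_class->prio_changed(rq, p, oldprio);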
+ */ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + int newprio); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval)(struct rq *rq, + struct task_struct *task); + + void (*update_curr)(struct rq *rq); + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*task_change_group)(struct task_struct *p); +#endif + +#ifdef CONFIG_SCHED_CORE + int (*task_is_throttled)(struct task_struct *p, int cpu); +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + WARN_ON_ONCE(rq->curr != prev); + prev->sched_class->put_prev_task(rq, prev); +} + +static inline void set_next_task(struct rq *rq, struct task_struct *next) +{ + next->sched_class->set_next_task(rq, next, false); +} + + +/* + * Helper to define a sched_class instance; each one is placed in a separate + * section which is ordered by the linker script: + * + * include/asm-generic/vmlinux.lds.h + * + * *CAREFUL* they are laid out in *REVERSE* order!!! + * + * Also enforce alignment on the instance, not the type, to guarantee layout. + */ +#define DEFINE_SCHED_CLASS(name) \ +const struct sched_class name##_sched_class \ + __aligned(__alignof__(struct sched_class)) \ + __section("__" #name "_sched_class") + +/* Defined in include/asm-generic/vmlinux.lds.h */ +extern struct sched_class __sched_class_highest[]; +extern struct sched_class __sched_class_lowest[]; + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class < (_to); class++) + +#define for_each_class(class) \ + for_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} + +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +extern struct task_struct *pick_next_task_idle(struct rq *rq); + +#define SCA_CHECK 0x01 +#define SCA_MIGRATE_DISABLE 0x02 +#define SCA_MIGRATE_ENABLE 0x04 +#define SCA_USER 0x08 + +#ifdef CONFIG_SMP + +extern void update_group_capacity(struct sched_domain *sd, int cpu); + +extern void trigger_load_balance(struct rq *rq); + +extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); + +static inline struct task_struct *get_push_task(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + lockdep_assert_rq_held(rq); + + if (rq->push_busy) + return NULL; + + if (p->nr_cpus_allowed == 1) + return NULL; + + if (p->migration_disabled) + return NULL; + + rq->push_busy = true; + return get_task_struct(p); +} + +extern int push_cpu_stop(void *arg); + +#endif + +#ifdef CONFIG_CPU_IDLE +static inline void 
idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + SCHED_WARN_ON(!rcu_read_lock_held()); + + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + +extern void schedule_idle(void); +asmlinkage void schedule_user(void); + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); + +extern void init_sched_dl_class(void); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); + +extern void __setscheduler_prio(struct task_struct *p, int prio); + +extern void resched_curr(struct rq *rq); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); + +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) +unsigned long to_ratio(u64 period, u64 runtime); + +extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p); + +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); +extern int __init sched_tick_offload_init(void); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. 
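+ *
+ * "Needed" is decided by sched_can_stop_tick(): e.g. a SCHED_DEADLINE task,
+ * competing SCHED_RR tasks, or more than one runnable task on the CPU keeps
+ * the tick enabled.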
+ */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline int sched_tick_offload_init(void) { return 0; } +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + unsigned prev_nr = rq->nr_running; + + rq->nr_running = prev_nr + count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } + +#ifdef CONFIG_SMP + if (prev_nr < 2 && rq->nr_running >= 2) { + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); + } +#endif + + sched_update_tick_dependency(rq); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + rq->nr_running -= count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, -count); + } + + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); +} + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + +extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + +#ifdef CONFIG_PREEMPT_RT +#define SCHED_NR_MIGRATE_BREAK 8 +#else +#define SCHED_NR_MIGRATE_BREAK 32 +#endif + +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +extern unsigned int sysctl_sched_base_slice; + +#ifdef CONFIG_SCHED_DEBUG +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; + +extern unsigned int sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_hot_threshold; +#endif + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +static inline int hrtick_enabled_fair(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtick_enabled(rq); +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + if (!sched_feat(HRTICK_DL)) + return 0; + return hrtick_enabled(rq); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled_fair(struct rq *rq) +{ + return 0; +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + return 0; +} + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifndef arch_scale_freq_tick +static __always_inline +void arch_scale_freq_tick(void) +{ +} +#endif + +#ifndef arch_scale_freq_capacity +/** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. + * @cpu: the CPU in question. 
+ * + * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * f_curr + * ------ * SCHED_CAPACITY_SCALE + * f_max + */ +static __always_inline +unsigned long arch_scale_freq_capacity(int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + +#ifdef CONFIG_SCHED_DEBUG +/* + * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to + * acquire rq lock instead of rq_lock(). So at the end of these two functions + * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of + * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. + */ +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) +{ + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ +#ifdef CONFIG_SMP + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); +#endif +} +#else +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} +#endif + +#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ +__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ +static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ +{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ + _lock; return _t; } + +#ifdef CONFIG_SMP + +static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) +{ +#ifdef CONFIG_SCHED_CORE + /* + * In order to not have {0,2},{1,3} turn into into an AB-BA, + * order by core-id first and cpu-id second. + * + * Notably: + * + * double_rq_lock(0,3); will take core-0, core-1 lock + * double_rq_lock(1,2); will take core-1, core-0 lock + * + * when only cpu-id is considered. + */ + if (rq1->core->cpu < rq2->core->cpu) + return true; + if (rq1->core->cpu > rq2->core->cpu) + return false; + + /* + * __sched_core_flip() relies on SMT having cpu-id lock order. + */ +#endif + return rq1->cpu < rq2->cpu; +} + +extern void double_rq_lock(struct rq *rq1, struct rq *rq2); + +#ifdef CONFIG_PREEMPTION + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower CPU-ids and will + * grant the double lock to lower CPUs over higher ids under contention, + * regardless of entry order into the function. 
+ */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + if (__rq_lockp(this_rq) == __rq_lockp(busiest) || + likely(raw_spin_rq_trylock(busiest))) { + double_rq_clock_clear_update(this_rq, busiest); + return 0; + } + + if (rq_order_less(this_rq, busiest)) { + raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(this_rq, busiest); + return 0; + } + + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#endif /* CONFIG_PREEMPTION */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + lockdep_assert_irqs_disabled(); + + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + if (__rq_lockp(this_rq) != __rq_lockp(busiest)) + raw_spin_rq_unlock(busiest); + lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_); +} + +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + raw_spin_lock(l1); + raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + raw_spin_unlock(l1); + raw_spin_unlock(l2); +} + +DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t, + double_raw_lock(_T->lock, _T->lock2), + double_raw_unlock(_T->lock, _T->lock2)) + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_unlock(rq2); + else + __release(rq2->lock); + raw_spin_rq_unlock(rq1); +} + +extern void set_rq_online (struct rq *rq, enum rq_onoff_reason reason); +extern void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason); +extern bool sched_smp_initialized; + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE(rq1 != rq2); + raw_spin_rq_lock(rq1); + __acquire(rq2->lock); /* Fake it out ;) */ + double_rq_clock_clear_update(rq1, rq2); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
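+ *
+ * Typical pairing (interrupts already disabled by the caller):
+ *
+ *	double_rq_lock(rq1, rq2);
+ *	...			// move state between the two runqueues
+ *	double_rq_unlock(rq1, rq2);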
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + WARN_ON_ONCE(rq1 != rq2); + raw_spin_rq_unlock(rq1); + __release(rq2->lock); +} + +#endif + +DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, + double_rq_lock(_T->lock, _T->lock2), + double_rq_unlock(_T->lock, _T->lock2)) + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_SCHED_DEBUG +extern bool sched_debug_verbose; + +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); +extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +extern void resched_latency_warn(int cpu, u64 latency); +#ifdef CONFIG_NUMA_BALANCING +extern void +show_numa_stats(struct task_struct *p, struct seq_file *m); +extern void +print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf); +#endif /* CONFIG_NUMA_BALANCING */ +#else +static inline void resched_latency_warn(int cpu, u64 latency) {} +#endif /* CONFIG_SCHED_DEBUG */ + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); + +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); + +#ifdef CONFIG_NO_HZ_COMMON +#define NOHZ_BALANCE_KICK_BIT 0 +#define NOHZ_STATS_KICK_BIT 1 +#define NOHZ_NEWILB_KICK_BIT 2 +#define NOHZ_NEXT_KICK_BIT 3 + +/* Run rebalance_domains() */ +#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +/* Update blocked load */ +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) +/* Update blocked load when entering idle */ +#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT) +/* Update nohz.next_balance */ +#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT) + +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) + +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(struct rq *rq); +#else +static inline void nohz_balance_exit_idle(struct rq *rq) { } +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void nohz_run_idle_balance(int cpu); +#else +static inline void nohz_run_idle_balance(int cpu) { } +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +struct irqtime { + u64 total; + u64 tick_delta; + u64 irq_start_time; + struct u64_stats_sync sync; +}; + +DECLARE_PER_CPU(struct irqtime, cpu_irqtime); + +/* + * Returns the irqtime minus the softirq time computed by ksoftirqd. + * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime + * and never move forward. + */ +static inline u64 irq_time_read(int cpu) +{ + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); + unsigned int seq; + u64 total; + + do { + seq = __u64_stats_fetch_begin(&irqtime->sync); + total = irqtime->total; + } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); + + return total; +} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. 
+ * + * This function is called by the scheduler on the CPU whose utilization is + * being updated. + * + * It can only be called from RCU-sched read-side critical sections. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). + * + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT tasks. + */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); + if (data) + data->func(data, rq_clock(rq), flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +# ifndef arch_scale_freq_invariant +# define arch_scale_freq_invariant() true +# endif +#else +# define arch_scale_freq_invariant() false +#endif + +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +/** + * enum cpu_util_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency + * @ENERGY_UTIL: Utilization used during energy calculation + * + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time + * need to be aggregated differently depending on the usage made of them. This + * enum is used within effective_cpu_util() to differentiate the types of + * utilization expected by the callers, and adjust the aggregation accordingly. + */ +enum cpu_util_type { + FREQUENCY_UTIL, + ENERGY_UTIL, +}; + +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + enum cpu_util_type type, + struct task_struct *p); + +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the original capacity of @cpu is + * greater than or equal to task's deadline density right shifted by + * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise. 
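+ *
+ * Worked example (illustrative): a task with 5ms runtime per 10ms deadline
+ * has dl_density ~= (5 << BW_SHIFT) / 10; shifted right by
+ * (BW_SHIFT - SCHED_CAPACITY_SHIFT) = 10 that is ~512, so only CPUs with an
+ * original capacity of at least half of SCHED_CAPACITY_SCALE (1024) fit.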
+ */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT); +} + +static inline unsigned long cpu_bw_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + + +extern unsigned long cpu_util_cfs(int cpu); +extern unsigned long cpu_util_cfs_boost(int cpu); + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#endif + +#ifdef CONFIG_UCLAMP_TASK +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +static inline unsigned long uclamp_rq_get(struct rq *rq, + enum uclamp_id clamp_id) +{ + return READ_ONCE(rq->uclamp[clamp_id].value); +} + +static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, + unsigned int value) +{ + WRITE_ONCE(rq->uclamp[clamp_id].value, value); +} + +static inline bool uclamp_rq_is_idle(struct rq *rq) +{ + return rq->uclamp_flags & UCLAMP_FLAG_IDLE; +} + +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ +static __always_inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + unsigned long min_util = 0; + unsigned long max_util = 0; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + if (p) { + min_util = uclamp_eff_value(p, UCLAMP_MIN); + max_util = uclamp_eff_value(p, UCLAMP_MAX); + + /* + * Ignore last runnable task's max clamp, as this task will + * reset it. Similarly, no need to read the rq's min clamp. + */ + if (uclamp_rq_is_idle(rq)) + goto out; + } + + min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN)); + max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX)); +out: + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} + +/* Is the rq being capped/throttled by uclamp_max? */ +static inline bool uclamp_rq_is_capped(struct rq *rq) +{ + unsigned long rq_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return false; + + rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. 
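+ *
+ * Such operations include, e.g., setting a per-task clamp via
+ * sched_setattr() with SCHED_FLAG_UTIL_CLAMP, or writing a cgroup's
+ * cpu.uclamp.{min,max}; both enable the sched_uclamp_used static key.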
+ * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline unsigned long uclamp_eff_value(struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + + return SCHED_CAPACITY_SCALE; +} + +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + return util; +} + +static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } + +static inline bool uclamp_is_used(void) +{ + return false; +} + +static inline unsigned long uclamp_rq_get(struct rq *rq, + enum uclamp_id clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + + return SCHED_CAPACITY_SCALE; +} + +static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, + unsigned int value) +{ +} + +static inline bool uclamp_rq_is_idle(struct rq *rq) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK */ + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} +#endif + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + +#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ + return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ + +#define perf_domain_span(pd) NULL +static inline bool sched_energy_enabled(void) { return false; } + +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. 
+ */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif + +#ifdef CONFIG_SMP +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} +#endif + +extern void swake_up_all_locked(struct swait_queue_head *q); +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags); + +#ifdef CONFIG_PREEMPT_DYNAMIC +extern int preempt_dynamic_mode; +extern int sched_dynamic_mode(const char *str); +extern void sched_dynamic_update(int mode); +#endif + +static inline void update_current_exec_runtime(struct task_struct *curr, + u64 now, u64 delta_exec) +{ + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = now; + cgroup_account_cputime(curr, delta_exec); +} + +#ifdef CONFIG_SCHED_MM_CID + +#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ +#define MM_CID_SCAN_DELAY 100 /* 100ms */ + +extern raw_spinlock_t cid_lock; +extern int use_cid_lock; + +extern void sched_mm_cid_migrate_from(struct task_struct *t); +extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); +extern void init_sched_mm_cid(struct task_struct *t); + +static inline void __mm_cid_put(struct mm_struct *mm, int cid) +{ + if (cid < 0) + return; + cpumask_clear_cpu(cid, mm_cidmask(mm)); +} + +/* + * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to + * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to + * be held to transition to other states. + * + * State transitions synchronized with cmpxchg or try_cmpxchg need to be + * consistent across cpus, which prevents use of this_cpu_cmpxchg. + */ +static inline void mm_cid_put_lazy(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid; + + lockdep_assert_irqs_disabled(); + cid = __this_cpu_read(pcpu_cid->cid); + if (!mm_cid_is_lazy_put(cid) || + !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + return; + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +} + +static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +{ + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid, res; + + lockdep_assert_irqs_disabled(); + cid = __this_cpu_read(pcpu_cid->cid); + for (;;) { + if (mm_cid_is_unset(cid)) + return MM_CID_UNSET; + /* + * Attempt transition from valid or lazy-put to unset. 
+ */ + res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); + if (res == cid) + break; + cid = res; + } + return cid; +} + +static inline void mm_cid_put(struct mm_struct *mm) +{ + int cid; + + lockdep_assert_irqs_disabled(); + cid = mm_cid_pcpu_unset(mm); + if (cid == MM_CID_UNSET) + return; + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +} + +static inline int __mm_cid_try_get(struct mm_struct *mm) +{ + struct cpumask *cpumask; + int cid; + + cpumask = mm_cidmask(mm); + /* + * Retry finding first zero bit if the mask is temporarily + * filled. This only happens during concurrent remote-clear + * which owns a cid without holding a rq lock. + */ + for (;;) { + cid = cpumask_first_zero(cpumask); + if (cid < nr_cpu_ids) + break; + cpu_relax(); + } + if (cpumask_test_and_set_cpu(cid, cpumask)) + return -1; + return cid; +} + +/* + * Save a snapshot of the current runqueue time of this cpu + * with the per-cpu cid value, allowing to estimate how recently it was used. + */ +static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) +{ + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + + lockdep_assert_rq_held(rq); + WRITE_ONCE(pcpu_cid->time, rq->clock); +} + +static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) +{ + int cid; + + /* + * All allocations (even those using the cid_lock) are lock-free. If + * use_cid_lock is set, hold the cid_lock to perform cid allocation to + * guarantee forward progress. + */ + if (!READ_ONCE(use_cid_lock)) { + cid = __mm_cid_try_get(mm); + if (cid >= 0) + goto end; + raw_spin_lock(&cid_lock); + } else { + raw_spin_lock(&cid_lock); + cid = __mm_cid_try_get(mm); + if (cid >= 0) + goto unlock; + } + + /* + * cid concurrently allocated. Retry while forcing following + * allocations to use the cid_lock to ensure forward progress. + */ + WRITE_ONCE(use_cid_lock, 1); + /* + * Set use_cid_lock before allocation. Only care about program order + * because this is only required for forward progress. + */ + barrier(); + /* + * Retry until it succeeds. It is guaranteed to eventually succeed once + * all newcoming allocations observe the use_cid_lock flag set. + */ + do { + cid = __mm_cid_try_get(mm); + cpu_relax(); + } while (cid < 0); + /* + * Allocate before clearing use_cid_lock. Only care about + * program order because this is for forward progress. + */ + barrier(); + WRITE_ONCE(use_cid_lock, 0); +unlock: + raw_spin_unlock(&cid_lock); +end: + mm_cid_snapshot_time(rq, mm); + return cid; +} + +static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) +{ + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + struct cpumask *cpumask; + int cid; + + lockdep_assert_rq_held(rq); + cpumask = mm_cidmask(mm); + cid = __this_cpu_read(pcpu_cid->cid); + if (mm_cid_is_valid(cid)) { + mm_cid_snapshot_time(rq, mm); + return cid; + } + if (mm_cid_is_lazy_put(cid)) { + if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + } + cid = __mm_cid_get(rq, mm); + __this_cpu_write(pcpu_cid->cid, cid); + return cid; +} + +static inline void switch_mm_cid(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + /* + * Provide a memory barrier between rq->curr store and load of + * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. + * + * Should be adapted if context_switch() is modified. 
+ */ + if (!next->mm) { // to kernel + /* + * user -> kernel transition does not guarantee a barrier, but + * we can use the fact that it performs an atomic operation in + * mmgrab(). + */ + if (prev->mm) // from user + smp_mb__after_mmgrab(); + /* + * kernel -> kernel transition does not change rq->curr->mm + * state. It stays NULL. + */ + } else { // to user + /* + * kernel -> user transition does not provide a barrier + * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. + * Provide it here. + */ + if (!prev->mm) { // from kernel + smp_mb(); + } else { // from user + /* + * user->user transition relies on an implicit + * memory barrier in switch_mm() when + * current->mm changes. If the architecture + * switch_mm() does not have an implicit memory + * barrier, it is emitted here. If current->mm + * is unchanged, no barrier is needed. + */ + //smp_mb__after_switch_mm(); + } + } + if (prev->mm_cid_active) { + mm_cid_snapshot_time(rq, prev->mm); + mm_cid_put_lazy(prev); + prev->mm_cid = -1; + } + if (next->mm_cid_active) + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); +} + +#else +static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } +static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } +static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } +static inline void init_sched_mm_cid(struct task_struct *t) { } +#endif + +extern u64 avg_vruntime(struct cfs_rq *cfs_rq); +extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + +#ifdef CONFIG_CGROUP_SCHED +enum cpu_cftype_id { +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + CPU_CFTYPE_WEIGHT, + CPU_CFTYPE_WEIGHT_NICE, + CPU_CFTYPE_IDLE, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + CPU_CFTYPE_MAX, + CPU_CFTYPE_MAX_BURST, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + CPU_CFTYPE_UCLAMP_MIN, + CPU_CFTYPE_UCLAMP_MAX, +#endif + CPU_CFTYPE_CNT, +}; + +extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; +#endif /* CONFIG_CGROUP_SCHED */ + +#include "ext.h" + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/ops/os_stat/os_stat/include_6_6/kernel/sched/stats.h b/ops/os_stat/os_stat/include_6_6/kernel/sched/stats.h new file mode 100644 index 0000000000000000000000000000000000000000..fa1c120f217d4eb0051f46232f887b6adbe0ef75 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/sched/stats.h @@ -0,0 +1,315 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_STATS_H +#define _KERNEL_STATS_H + +#ifdef CONFIG_CGROUP_SLI +#include +#endif + +#ifdef CONFIG_SCHEDSTATS + +extern struct static_key_false sched_schedstats; + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcount++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_cpu_time += delta; +} + +static inline void +rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var) do { var++; } while (0) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) 
+#define __schedstat_add(var, amt) do { var += (amt); } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) + +void __update_stats_wait_start(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); + +void __update_stats_wait_end(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); +void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); + +static inline void +check_schedstat_required(void) +{ + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n"); +} + +#else /* !CONFIG_SCHEDSTATS: */ + +static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } +# define schedstat_enabled() 0 +# define __schedstat_inc(var) do { } while (0) +# define schedstat_inc(var) do { } while (0) +# define __schedstat_add(var, amt) do { } while (0) +# define schedstat_add(var, amt) do { } while (0) +# define __schedstat_set(var, val) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +# define schedstat_val(var) 0 +# define schedstat_val_or_zero(var) 0 + +# define __update_stats_wait_start(rq, p, stats) do { } while (0) +# define __update_stats_wait_end(rq, p, stats) do { } while (0) +# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0) +# define check_schedstat_required() do { } while (0) + +#endif /* CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +struct sched_entity_stats { + struct sched_entity se; + struct sched_statistics stats; +} __no_randomize_layout; +#endif + +static inline struct sched_statistics * +__schedstats_from_se(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) + return &container_of(se, struct sched_entity_stats, se)->stats; +#endif + return &task_of(se)->stats; +} + +#ifdef CONFIG_PSI +void psi_task_change(struct task_struct *task, int clear, int set); +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); +#else +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} +#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. 
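+ *
+ * For example, a wakeup-time migration first deregisters the task's
+ * sleep-persistent states on the old CPU (psi_ttwu_dequeue()) and then
+ * accounts it as runnable on the new CPU via psi_enqueue(), so iowait and
+ * memstall time is neither lost nor double counted.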
+ */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (static_branch_likely(&psi_disabled)) + return; + + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; + + if (!wakeup) { + if (p->in_memstall) + set |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + if (static_branch_likely(&psi_disabled)) + return; + + /* + * A voluntary sleep is a dequeue followed by a task switch. To + * avoid walking all ancestors twice, psi_task_switch() handles + * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. + * Do nothing here. + */ + if (sleep) + return; + + psi_task_change(p, p->psi_flags, 0); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (static_branch_likely(&psi_disabled)) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->psi_flags)) { + struct rq_flags rf; + struct rq *rq; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, p->psi_flags, 0); + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) +{ + if (static_branch_likely(&psi_disabled)) + return; + + psi_task_switch(prev, next, sleep); +} + +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) {} +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} +#endif /* CONFIG_PSI */ + +#ifdef CONFIG_SCHED_INFO +/* + * We are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a CPU, we call this routine + * from dequeue_task() to account for possible rq->clock skew across CPUs. The + * delta taken on each CPU would annul the skew. + */ +static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) +{ + unsigned long long delta = 0; + + if (!t->sched_info.last_queued) + return; + + delta = rq_clock(rq) - t->sched_info.last_queued; + t->sched_info.last_queued = 0; + t->sched_info.run_delay += delta; + + rq_sched_info_dequeue(rq, delta); +} + +/* + * Called when a task finally hits the CPU. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +#ifdef CONFIG_CGROUP_SLI +static void sched_info_arrive(struct rq *rq, struct task_struct *t, struct task_struct *prev) +#else +static void sched_info_arrive(struct rq *rq, struct task_struct *t) +#endif +{ + unsigned long long now, delta = 0; + + if (!t->sched_info.last_queued) + return; + + now = rq_clock(rq); + delta = now - t->sched_info.last_queued; + t->sched_info.last_queued = 0; + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcount++; + rq_sched_info_arrive(rq, delta); +#ifdef CONFIG_CGROUP_SLI + sli_schedlat_rundelay(t, prev, delta); +#endif +} + +/* + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. 
It's assumed that + * sched_info_dequeue() will clear that stamp when appropriate. + */ +static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t) +{ + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); +} + +/* + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_enqueue() to mark that it has now again started waiting on + * the runqueue. + */ +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) +{ + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; + + rq_sched_info_depart(rq, delta); + + if (task_is_running(t)) + sched_info_enqueue(rq, t); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + /* + * prev now departs the CPU. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. + */ + if (prev != rq->idle) + sched_info_depart(rq, prev); + + if (next != rq->idle) +#ifdef CONFIG_CGROUP_SLI + sched_info_arrive(rq, next, prev); +#else + sched_info_arrive(rq, next); +#endif +} + +#else /* !CONFIG_SCHED_INFO: */ +# define sched_info_enqueue(rq, t) do { } while (0) +# define sched_info_dequeue(rq, t) do { } while (0) +# define sched_info_switch(rq, t, next) do { } while (0) +#endif /* CONFIG_SCHED_INFO */ + +#endif /* _KERNEL_STATS_H */ diff --git a/ops/os_stat/os_stat/include_6_6/kernel/workqueue_internal.h b/ops/os_stat/os_stat/include_6_6/kernel/workqueue_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..498de0e909a438b6bef54e2a471270bc51e9e19c --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/kernel/workqueue_internal.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * kernel/workqueue_internal.h + * + * Workqueue internal header file. Only to be included by workqueue and + * core kernel subsystems. + */ +#ifndef _KERNEL_WORKQUEUE_INTERNAL_H +#define _KERNEL_WORKQUEUE_INTERNAL_H + +#include +#include +#include + +struct worker_pool; + +/* + * The poor guys doing the actual heavy lifting. All on-duty workers are + * either serving the manager role, on idle list or on busy hash. For + * details on the locking annotation (L, I, X...), refer to workqueue.c. + * + * Only to be used in workqueue and async. 
+ */ +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; + + struct work_struct *current_work; /* L: work being processed */ + work_func_t current_func; /* L: current_work's fn */ + struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + struct list_head scheduled; /* L: scheduled works */ + + /* 64 bytes boundary on 64bit, 32 on 32bit */ + + struct task_struct *task; /* I: worker task */ + struct worker_pool *pool; /* A: the associated pool */ + /* L: for rescuers */ + struct list_head node; /* A: anchored at pool->workers */ + /* A: runs through worker->node */ + + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + int sleeping; /* None */ + + /* + * Opaque string set with work_set_desc(). Printed out with task + * dump for debugging - WARN, BUG, panic or sysrq. + */ + char desc[WORKER_DESC_LEN]; + + /* used only by rescuers to point to the target workqueue */ + struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ + + /* used by the scheduler to determine a worker's last known identity */ + work_func_t last_func; +}; + +/** + * current_wq_worker - return struct worker if %current is a workqueue worker + */ +static inline struct worker *current_wq_worker(void) +{ + if (in_task() && (current->flags & PF_WQ_WORKER)) + return kthread_data(current); + return NULL; +} + +/* + * Scheduler hooks for concurrency managed workqueue. Only to be used from + * sched/ and workqueue.c. + */ +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); +work_func_t wq_worker_last_func(struct task_struct *task); + +#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ diff --git a/ops/os_stat/os_stat/include_6_6/mm/slab.h b/ops/os_stat/os_stat/include_6_6/mm/slab.h new file mode 100644 index 0000000000000000000000000000000000000000..62df6eeeb5ead7d70ca28451577eba773751e420 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/mm/slab.h @@ -0,0 +1,898 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ +void __init kmem_cache_init(void); + +#ifdef CONFIG_64BIT +# ifdef system_has_cmpxchg128 +# define system_has_freelist_aba() system_has_cmpxchg128() +# define try_cmpxchg_freelist try_cmpxchg128 +# endif +#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 +typedef u128 freelist_full_t; +#else /* CONFIG_64BIT */ +# ifdef system_has_cmpxchg64 +# define system_has_freelist_aba() system_has_cmpxchg64() +# define try_cmpxchg_freelist try_cmpxchg64 +# endif +#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 +typedef u64 freelist_full_t; +#endif /* CONFIG_64BIT */ + +#if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) +#undef system_has_freelist_aba +#endif + +/* + * Freelist pointer and counter to cmpxchg together, avoids the typical ABA + * problems with cmpxchg of just a pointer. 
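+ *
+ * With a cmpxchg on the freelist pointer alone, a list that goes A -> B -> A
+ * between the initial read and the cmpxchg still compares equal, so a stale
+ * "next" pointer could be installed. Comparing {freelist, counter} as one
+ * double-word value closes that window: the counter word changes whenever
+ * objects are taken from or returned to the slab, so the second "A" no
+ * longer matches and the stale update fails and is retried.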
+ */ +typedef union { + struct { + void *freelist; + unsigned long counter; + }; + freelist_full_t full; +} freelist_aba_t; + +/* Reuses the bits in struct page */ +struct slab { + unsigned long __page_flags; + +#if defined(CONFIG_SLAB) + + struct kmem_cache *slab_cache; + union { + struct { + struct list_head slab_list; + void *freelist; /* array of free object indexes */ + void *s_mem; /* first object */ + }; + struct rcu_head rcu_head; + }; + unsigned int active; + +#elif defined(CONFIG_SLUB) + + struct kmem_cache *slab_cache; + union { + struct { + union { + struct list_head slab_list; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct { + struct slab *next; + int slabs; /* Nr of slabs left */ + }; +#endif + }; + /* Double-word boundary */ + union { + struct { + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + /* + * If slab debugging is enabled then the + * frozen bit can be reused to indicate + * that the slab was corrupted + */ + unsigned frozen:1; + }; + }; + }; +#ifdef system_has_freelist_aba + freelist_aba_t freelist_counter; +#endif + }; + }; + struct rcu_head rcu_head; + }; + unsigned int __unused; + +#else +#error "Unexpected slab allocator configured" +#endif + + atomic_t __page_refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif +}; + +#define SLAB_MATCH(pg, sl) \ + static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) +SLAB_MATCH(flags, __page_flags); +SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ +SLAB_MATCH(_refcount, __page_refcount); +#ifdef CONFIG_MEMCG +SLAB_MATCH(memcg_data, memcg_data); +#endif +#undef SLAB_MATCH +static_assert(sizeof(struct slab) <= sizeof(struct page)); +#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB) +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); +#endif + +/** + * folio_slab - Converts from folio to slab. + * @folio: The folio. + * + * Currently struct slab is a different representation of a folio where + * folio_test_slab() is true. + * + * Return: The slab which contains this folio. + */ +#define folio_slab(folio) (_Generic((folio), \ + const struct folio *: (const struct slab *)(folio), \ + struct folio *: (struct slab *)(folio))) + +/** + * slab_folio - The folio allocated for a slab + * @slab: The slab. + * + * Slabs are allocated as folios that contain the individual objects and are + * using some fields in the first struct page of the folio - those fields are + * now accessed by struct slab. It is occasionally necessary to convert back to + * a folio in order to communicate with the rest of the mm. Please use this + * helper function instead of casting yourself, as the implementation may change + * in the future. + */ +#define slab_folio(s) (_Generic((s), \ + const struct slab *: (const struct folio *)s, \ + struct slab *: (struct folio *)s)) + +/** + * page_slab - Converts from first struct page to slab. + * @p: The first (either head of compound or single) page of slab. + * + * A temporary wrapper to convert struct page to struct slab in situations where + * we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct slab directly or go + * through folio to struct slab. 
+ * + * Return: The slab which contains this page + */ +#define page_slab(p) (_Generic((p), \ + const struct page *: (const struct slab *)(p), \ + struct page *: (struct slab *)(p))) + +/** + * slab_page - The first struct page allocated for a slab + * @slab: The slab. + * + * A convenience wrapper for converting slab to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct slab. + */ +#define slab_page(s) folio_page(slab_folio(s), 0) + +/* + * If network-based swap is enabled, sl*b must keep track of whether pages + * were allocated from pfmemalloc reserves. + */ +static inline bool slab_test_pfmemalloc(const struct slab *slab) +{ + return folio_test_active((struct folio *)slab_folio(slab)); +} + +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + folio_set_active(slab_folio(slab)); +} + +static inline void slab_clear_pfmemalloc(struct slab *slab) +{ + folio_clear_active(slab_folio(slab)); +} + +static inline void __slab_clear_pfmemalloc(struct slab *slab) +{ + __folio_clear_active(slab_folio(slab)); +} + +static inline void *slab_address(const struct slab *slab) +{ + return folio_address(slab_folio(slab)); +} + +static inline int slab_nid(const struct slab *slab) +{ + return folio_nid(slab_folio(slab)); +} + +static inline pg_data_t *slab_pgdat(const struct slab *slab) +{ + return folio_pgdat(slab_folio(slab)); +} + +static inline struct slab *virt_to_slab(const void *addr) +{ + struct folio *folio = virt_to_folio(addr); + + if (!folio_test_slab(folio)) + return NULL; + + return folio_slab(folio); +} + +static inline int slab_order(const struct slab *slab) +{ + return folio_order((struct folio *)slab_folio(slab)); +} + +static inline size_t slab_size(const struct slab *slab) +{ + return PAGE_SIZE << slab_order(slab); +} + +#ifdef CONFIG_SLAB +#include +#endif + +#ifdef CONFIG_SLUB +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. 
+ */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name[NR_KMALLOC_TYPES]; + unsigned int size; +} kmalloc_info[]; + +/* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); +void create_kmalloc_caches(slab_flags_t); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller); + +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller); +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); + +gfp_t kmalloc_fix_flags(gfp_t flags); + +/* Functions provided by the slab allocators */ +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); + +void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, + slab_flags_t flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize); + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(unsigned size, unsigned align, + slab_flags_t flags, const char *name, void (*ctor)(void *)); +struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)); + +slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name); + +static inline bool is_kmalloc_cache(struct kmem_cache *s) +{ + return (s->flags & SLAB_KMALLOC); +} + +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_ACCOUNT | SLAB_NO_MERGE) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) +#else +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) +#endif + +/* Common flags available with current configuration */ +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + 
SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_ACCOUNT | \ + SLAB_KMALLOC | \ + SLAB_NO_MERGE | \ + SLAB_NO_USER_FLAGS) + +bool __kmem_cache_empty(struct kmem_cache *); +int __kmem_cache_shutdown(struct kmem_cache *); +void __kmem_cache_release(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); +void slab_kmem_cache_release(struct kmem_cache *); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; +} + +#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SLUB_DEBUG_ON +DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); +#else +DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); +#endif +extern void print_tracking(struct kmem_cache *s, void *object); +long validate_slab_cache(struct kmem_cache *s); +static inline bool __slub_debug_enabled(void) +{ + return static_branch_unlikely(&slub_debug_enabled); +} +#else +static inline void print_tracking(struct kmem_cache *s, void *object) +{ +} +static inline bool __slub_debug_enabled(void) +{ + return false; +} +#endif + +/* + * Returns true if any of the specified slub_debug flags is enabled for the + * cache. Use only for flags parsed by setup_slub_debug() as it also enables + * the static key. + */ +static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) +{ + if (IS_ENABLED(CONFIG_SLUB_DEBUG)) + VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); + if (__slub_debug_enabled()) + return s->flags & flags; + return false; +} + +#ifdef CONFIG_MEMCG_KMEM +/* + * slab_objcgs - get the object cgroups vector associated with a slab + * @slab: a pointer to the slab struct + * + * Returns a pointer to the object cgroups vector associated with the slab, + * or NULL if no such vector has been associated yet. + */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + unsigned long memcg_data = READ_ONCE(slab->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), + slab_page(slab)); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab); +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr); + +static inline void memcg_free_slab_cgroups(struct slab *slab) +{ + kfree(slab_objcgs(slab)); + slab->memcg_data = 0; +} + +static inline size_t obj_full_size(struct kmem_cache *s) +{ + /* + * For each accounted object there is an extra space which is used + * to store obj_cgroup membership. Charge it too. + */ + return s->size + sizeof(struct obj_cgroup *); +} + +/* + * Returns false if the allocation should fail. 
+ */ +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + struct obj_cgroup *objcg; + + if (!memcg_kmem_online()) + return true; + + if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) + return true; + + objcg = get_obj_cgroup_from_current(); + if (!objcg) + return true; + + if (lru) { + int ret; + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + ret = memcg_list_lru_alloc(memcg, lru, flags); + css_put(&memcg->css); + + if (ret) + goto out; + } + + if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) + goto out; + + *objcgp = objcg; + return true; +out: + obj_cgroup_put(objcg); + return false; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ + struct slab *slab; + unsigned long off; + size_t i; + + if (!memcg_kmem_online() || !objcg) + return; + + for (i = 0; i < size; i++) { + if (likely(p[i])) { + slab = virt_to_slab(p[i]); + + if (!slab_objcgs(slab) && + memcg_alloc_slab_cgroups(slab, s, flags, + false)) { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + continue; + } + + off = obj_to_index(s, slab, p[i]); + obj_cgroup_get(objcg); + slab_objcgs(slab)[off] = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), + cache_vmstat_idx(s), obj_full_size(s)); + } else { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + } + } + obj_cgroup_put(objcg); +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ + struct obj_cgroup **objcgs; + int i; + + if (!memcg_kmem_online()) + return; + + objcgs = slab_objcgs(slab); + if (!objcgs) + return; + + for (i = 0; i < objects; i++) { + struct obj_cgroup *objcg; + unsigned int off; + + off = obj_to_index(s, slab, p[i]); + objcg = objcgs[off]; + if (!objcg) + continue; + + objcgs[off] = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } +} + +#else /* CONFIG_MEMCG_KMEM */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + return NULL; +} + +static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) +{ + return NULL; +} + +static inline int memcg_alloc_slab_cgroups(struct slab *slab, + struct kmem_cache *s, gfp_t gfp, + bool new_slab) +{ + return 0; +} + +static inline void memcg_free_slab_cgroups(struct slab *slab) +{ +} + +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + return true; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct slab *slab; + + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", + __func__)) + return NULL; + return slab->slab_cache; +} + +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) +{ + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) + memcg_alloc_slab_cgroups(slab, s, gfp, true); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + PAGE_SIZE << order); +} + 
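+/*
+ * Illustrative call pattern (sketch only, not a function in this header):
+ * the allocation and free paths are expected to bracket a slab's lifetime
+ * with this pair, e.g.
+ *
+ *	account_slab(slab, order, s, gfp);	charges memcg and node vmstat
+ *	...
+ *	unaccount_slab(slab, order, s);		reverses the same charge
+ *
+ * Both sides use PAGE_SIZE << order, so the per-node vmstat counter returns
+ * to its previous value once the slab is freed.
+ */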
+static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) +{ + if (memcg_kmem_online()) + memcg_free_slab_cgroups(slab); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + -(PAGE_SIZE << order)); +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && cachep != s, + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + +void free_large_kmalloc(struct folio *folio, void *object); + +size_t __ksize(const void *objp); + +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t size, gfp_t flags) +{ + flags &= gfp_allowed_mask; + + might_alloc(flags); + + if (should_failslab(s, flags)) + return NULL; + + if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) + return NULL; + + return s; +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, gfp_t flags, + size_t size, void **p, bool init, + unsigned int orig_size) +{ + unsigned int zero_size = s->object_size; + bool kasan_init = init; + size_t i; + + flags &= gfp_allowed_mask; + + /* + * For kmalloc object, the allocated memory size(object_size) is likely + * larger than the requested size(orig_size). If redzone check is + * enabled for the extra space, don't zero it, as it will be redzoned + * soon. The redzone operation for this extra space could be seen as a + * replacement of current poisoning under certain debug option, and + * won't break other sanity checks. + */ + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && + (s->flags & SLAB_KMALLOC)) + zero_size = orig_size; + + /* + * When slub_debug is enabled, avoid memory initialization integrated + * into KASAN and instead zero out the memory via the memset below with + * the proper size. Otherwise, KASAN might overwrite SLUB redzones and + * cause false-positive reports. This does not lead to a performance + * penalty on production builds, as slub_debug is not intended to be + * enabled there. + */ + if (__slub_debug_enabled()) + kasan_init = false; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. 
+ */ + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); + if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) + memset(p[i], 0, zero_size); + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); + } + + memcg_slab_post_alloc_hook(s, objcg, flags, size, p); +} + +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { +#ifdef CONFIG_SLAB + raw_spinlock_t list_lock; + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct alien_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + spinlock_t list_lock; + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + + +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +void dump_unreclaimable_slab(void); +#else +static inline void dump_unreclaimable_slab(void) +{ +} +#endif + +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp); +void cache_random_seq_destroy(struct kmem_cache *cachep); +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, + unsigned int count, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) +{ + if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc)) { + if (c->ctor) + return false; + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; + } + return flags & __GFP_ZERO; +} + +static inline bool slab_want_init_on_free(struct kmem_cache *c) +{ + if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, + &init_on_free)) + return !(c->ctor || + (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + return false; +} + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) +void debugfs_slab_release(struct kmem_cache *); +#else +static inline void debugfs_slab_release(struct kmem_cache *s) { } +#endif + +#ifdef CONFIG_PRINTK +#define KS_ADDRS_COUNT 16 +struct kmem_obj_info { + void *kp_ptr; + struct slab *kp_slab; + void *kp_objp; + unsigned long kp_data_offset; + struct kmem_cache *kp_slab_cache; + void *kp_ret; + void *kp_stack[KS_ADDRS_COUNT]; + void *kp_free_stack[KS_ADDRS_COUNT]; +}; +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); 
+#endif + +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user); + +#ifdef CONFIG_SLUB_DEBUG +void skip_orig_size_check(struct kmem_cache *s, const void *object); +#endif + +#endif /* MM_SLAB_H */ diff --git a/ops/os_stat/os_stat/include_6_6/mm/swap.h b/ops/os_stat/os_stat/include_6_6/mm/swap.h new file mode 100644 index 0000000000000000000000000000000000000000..bcfee9e220a089afc5129a11414b3fa870400034 --- /dev/null +++ b/ops/os_stat/os_stat/include_6_6/mm/swap.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_SWAP_H +#define _MM_SWAP_H + +#ifdef CONFIG_SWAP +#include /* for bio_end_io_t */ + +/* linux/mm/page_io.c */ +int sio_pool_init(void); +struct swap_iocb; +void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug); +void __swap_read_unplug(struct swap_iocb *plug); +static inline void swap_read_unplug(struct swap_iocb *plug) +{ + if (unlikely(plug)) + __swap_read_unplug(plug); +} +void swap_write_unplug(struct swap_iocb *sio); +int swap_writepage(struct page *page, struct writeback_control *wbc); +int __swap_writepage(struct page *page, struct writeback_control *wbc); + +/* linux/mm/swap_state.c */ +/* One swap address space for each 64M swap space */ +#define SWAP_ADDRESS_SPACE_SHIFT 14 +#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) +extern struct address_space *swapper_spaces[]; +#define swap_address_space(entry) \ + (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ + >> SWAP_ADDRESS_SPACE_SHIFT]) + +void show_swap_cache_info(void); +bool add_to_swap(struct folio *folio); +void *get_shadow_from_swap_cache(swp_entry_t entry); +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, + gfp_t gfp, void **shadowp); +void __delete_from_swap_cache(struct folio *folio, + swp_entry_t entry, void *shadow); +void delete_from_swap_cache(struct folio *folio); +void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr); +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index); + +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, + unsigned long addr, + struct swap_iocb **plug); +struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, + unsigned long addr, + bool *new_page_allocated); +struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); +struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); + +static inline unsigned int folio_swap_flags(struct folio *folio) +{ + return page_swap_info(&folio->page)->flags; +} +#else /* CONFIG_SWAP */ +struct swap_iocb; +static inline void swap_readpage(struct page *page, bool do_poll, + struct swap_iocb **plug) +{ +} +static inline void swap_write_unplug(struct swap_iocb *sio) +{ +} + +static inline struct address_space *swap_address_space(swp_entry_t entry) +{ + return NULL; +} + +static inline void show_swap_cache_info(void) +{ +} + +static inline struct page *swap_cluster_readahead(swp_entry_t entry, + gfp_t gfp_mask, struct vm_fault *vmf) +{ + return NULL; +} + +static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, + struct vm_fault *vmf) +{ + return NULL; +} + +static inline int swap_writepage(struct page *p, struct 
writeback_control *wbc) +{ + return 0; +} + +static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +{ +} + +static inline struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) +{ + return NULL; +} + +static inline +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index) +{ + return filemap_get_folio(mapping, index); +} + +static inline bool add_to_swap(struct folio *folio) +{ + return false; +} + +static inline void *get_shadow_from_swap_cache(swp_entry_t entry) +{ + return NULL; +} + +static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, + gfp_t gfp_mask, void **shadowp) +{ + return -1; +} + +static inline void __delete_from_swap_cache(struct folio *folio, + swp_entry_t entry, void *shadow) +{ +} + +static inline void delete_from_swap_cache(struct folio *folio) +{ +} + +static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end) +{ +} + +static inline unsigned int folio_swap_flags(struct folio *folio) +{ + return 0; +} +#endif /* CONFIG_SWAP */ +#endif /* _MM_SWAP_H */ diff --git a/ops/os_stat/os_stat/include_private/arch/x86/include/asm/syscall.h b/ops/os_stat/os_stat/include_private/arch/x86/include/asm/syscall.h new file mode 100644 index 0000000000000000000000000000000000000000..8db3fdb6102ecb373f085de3a6033c645fbe644c --- /dev/null +++ b/ops/os_stat/os_stat/include_private/arch/x86/include/asm/syscall.h @@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Access to user system call parameters and results + * + * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. + * + * See asm-generic/syscall.h for descriptions of what we must do here. + */ + +#ifndef _ASM_X86_SYSCALL_H +#define _ASM_X86_SYSCALL_H + +#include +#include +#include +#include /* For NR_syscalls */ +#include /* for TS_COMPAT */ +#include + +#ifdef CONFIG_X86_64 +typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *); +#else +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); +#endif /* CONFIG_X86_64 */ +extern const sys_call_ptr_t sys_call_table[]; + +#if defined(CONFIG_X86_32) +#define ia32_sys_call_table sys_call_table +#define __NR_syscall_compat_max __NR_syscall_max +#define IA32_NR_syscalls NR_syscalls +#endif + +#if defined(CONFIG_IA32_EMULATION) +extern const sys_call_ptr_t ia32_sys_call_table[]; +#endif + +#ifdef CONFIG_X86_X32_ABI +extern const sys_call_ptr_t x32_sys_call_table[]; +#endif + +/* + * Only the low 32 bits of orig_ax are meaningful, so we return int. + * This importantly ignores the high bits on 64-bit, so comparisons + * sign-extend the low 32 bits. + */ +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) +{ + return regs->orig_ax; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + regs->ax = regs->orig_ax; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + unsigned long error = regs->ax; +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entries and then + * remains set until we return to user mode. + */ + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long) (int) error; +#endif + return IS_ERR_VALUE(error) ? 
error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->ax; +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->ax = (long) error ?: val; +} + +#ifdef CONFIG_X86_32 + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + memcpy(args, ®s->bx, 6 * sizeof(args[0])); +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ + BUG_ON(i + n > 6); + memcpy(®s->bx + i, args, n * sizeof(args[0])); +} + +static inline int syscall_get_arch(struct task_struct *task) +{ + return AUDIT_ARCH_I386; +} + +#else /* CONFIG_X86_64 */ + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + *args++ = regs->bx; + *args++ = regs->cx; + *args++ = regs->dx; + *args++ = regs->si; + *args++ = regs->di; + *args = regs->bp; + } else +# endif + { + *args++ = regs->di; + *args++ = regs->si; + *args++ = regs->dx; + *args++ = regs->r10; + *args++ = regs->r8; + *args = regs->r9; + } +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + regs->bx = *args++; + regs->cx = *args++; + regs->dx = *args++; + regs->si = *args++; + regs->di = *args++; + regs->bp = *args; + } else +# endif + { + regs->di = *args++; + regs->si = *args++; + regs->dx = *args++; + regs->r10 = *args++; + regs->r8 = *args++; + regs->r9 = *args; + } +} + +static inline int syscall_get_arch(struct task_struct *task) +{ + /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ + return (IS_ENABLED(CONFIG_IA32_EMULATION) && + task->thread_info.status & TS_COMPAT) + ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; +} +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SYSCALL_H */ diff --git a/ops/os_stat/os_stat/include_private/drivers/block/loop.h b/ops/os_stat/os_stat/include_private/drivers/block/loop.h new file mode 100644 index 0000000000000000000000000000000000000000..e34edecf7d50b56105236735c41d4b6c217cf8a0 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/drivers/block/loop.h @@ -0,0 +1,94 @@ +/* + * loop.h + * + * Written by Theodore Ts'o, 3/29/93. + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. 
+ */ +#ifndef _LINUX_LOOP_H +#define _LINUX_LOOP_H + +#include +#include +#include +#include +#include +#include +#include + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, +}; + +struct loop_func_table; + +struct loop_device { + int lo_number; + atomic_t lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + int (*transfer)(struct loop_device *, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + char lo_file_name[LO_NAME_SIZE]; + char lo_crypt_name[LO_NAME_SIZE]; + char lo_encrypt_key[LO_KEY_SIZE]; + int lo_encrypt_key_size; + struct loop_func_table *lo_encryption; + __u32 lo_init[2]; + kuid_t lo_key_owner; /* Who set the key */ + int (*ioctl)(struct loop_device *, int cmd, + unsigned long arg); + + struct file * lo_backing_file; + struct block_device *lo_device; + void *key_data; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + struct kthread_worker worker; + struct task_struct *worker_task; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; +}; + +struct loop_cmd { + struct kthread_work work; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *css; +}; + +/* Support for loadable transfer modules */ +struct loop_func_table { + int number; /* filter type */ + int (*transfer)(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + int (*init)(struct loop_device *, const struct loop_info64 *); + /* release is called from loop_unregister_transfer or clr_fd */ + int (*release)(struct loop_device *); + int (*ioctl)(struct loop_device *, int cmd, unsigned long arg); + struct module *owner; +}; + +int loop_register_transfer(struct loop_func_table *funcs); +int loop_unregister_transfer(int number); + +#endif diff --git a/ops/os_stat/os_stat/include_private/drivers/target/target_core_file.h b/ops/os_stat/os_stat/include_private/drivers/target/target_core_file.h new file mode 100644 index 0000000000000000000000000000000000000000..929b1ecd544ee0ffb84973b64867a3dabb8a2f45 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/drivers/target/target_core_file.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef TARGET_CORE_FILE_H +#define TARGET_CORE_FILE_H + +#include + +#define FD_VERSION "4.0" + +#define FD_MAX_DEV_NAME 256 +#define FD_MAX_DEV_PROT_NAME FD_MAX_DEV_NAME + 16 +#define FD_DEVICE_QUEUE_DEPTH 32 +#define FD_MAX_DEVICE_QUEUE_DEPTH 128 +#define FD_BLOCKSIZE 512 +/* + * Limited by the number of iovecs (2048) per vfs_[writev,readv] call + */ +#define FD_MAX_BYTES 8388608 + +#define RRF_EMULATE_CDB 0x01 +#define RRF_GOT_LBA 0x02 + +#define FBDF_HAS_PATH 0x01 +#define FBDF_HAS_SIZE 0x02 +#define FDBD_HAS_BUFFERED_IO_WCE 0x04 +#define FDBD_HAS_ASYNC_IO 0x08 +#define FDBD_FORMAT_UNIT_SIZE 2048 + +struct fd_dev { + struct se_device dev; + + u32 fbd_flags; + unsigned char fd_dev_name[FD_MAX_DEV_NAME]; + /* Unique Ramdisk Device ID in Ramdisk HBA */ + u32 fd_dev_id; + /* Number of SG tables in sg_table_array */ + u32 fd_table_count; + u32 fd_queue_depth; + u32 fd_block_size; + unsigned long long fd_dev_size; + struct file *fd_file; + struct file *fd_prot_file; + /* FILEIO HBA device is connected to */ + struct fd_host *fd_host; +} 
____cacheline_aligned; + +struct fd_host { + u32 fd_host_dev_id_count; + /* Unique FILEIO Host ID */ + u32 fd_host_id; +} ____cacheline_aligned; + +#endif /* TARGET_CORE_FILE_H */ diff --git a/ops/os_stat/os_stat/include_private/fs/ext4_new/ext4.h b/ops/os_stat/os_stat/include_private/fs/ext4_new/ext4.h new file mode 100644 index 0000000000000000000000000000000000000000..ca699af28e4de8e1f50a56a198ab9addaa0472a2 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/fs/ext4_new/ext4.h @@ -0,0 +1,3446 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#include +#include + +#include + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. 
+ */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + atomic_t count; /* reference counter */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +} ext4_io_end_t; + +struct ext4_io_submit { + struct writeback_control *io_wbc; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) 
(EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ + EXT4_PROJINHERIT_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 mdata_blocks; + __u32 free_clusters_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define 
EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. */ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT +#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + +#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define 
EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK 
((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. 
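A rough, self-contained illustration of the table above: the two epoch bits stored at the low end of each *_extra word carry bits 32..33 of the 64-bit second count, and the nanoseconds sit above them, shifted left by EXT4_EPOCH_BITS. The snippet is a userspace sketch with stand-in names (pack_time, unpack_time, SKETCH_*); it is not part of this header, whose real helpers ext4_encode_extra_time() and ext4_decode_extra_time() follow.

#include <stdint.h>
#include <stdio.h>

#define SKETCH_EPOCH_BITS 2
#define SKETCH_EPOCH_MASK ((1u << SKETCH_EPOCH_BITS) - 1)

/* Split a 64-bit second count + nanoseconds into the classic 32-bit
 * xtime word and the packed xtime_extra word, as described above. */
static void pack_time(int64_t sec, uint32_t nsec, uint32_t *xtime, uint32_t *extra)
{
        *xtime = (uint32_t)sec;
        *extra = (uint32_t)(((sec - (int32_t)sec) >> 32) & SKETCH_EPOCH_MASK) |
                 (nsec << SKETCH_EPOCH_BITS);
}

/* Sign-extend the 32-bit word, then add back the epoch adjustment. */
static void unpack_time(uint32_t xtime, uint32_t extra, int64_t *sec, uint32_t *nsec)
{
        *sec = (int32_t)xtime;
        if (extra & SKETCH_EPOCH_MASK)
                *sec += (int64_t)(extra & SKETCH_EPOCH_MASK) << 32;
        *nsec = extra >> SKETCH_EPOCH_BITS;
}

int main(void)
{
        int64_t sec = 0x100000000LL, back;      /* first second of the 2106-02-07 row above */
        uint32_t xtime, extra, nsec;

        pack_time(sec, 123456789u, &xtime, &extra);
        unpack_time(xtime, extra, &back, &nsec);
        printf("epoch bits=%u, decoded sec=%lld, nsec=%u\n",
               extra & SKETCH_EPOCH_MASK, (long long)back, nsec);
        return 0;
}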
+ */ + +static inline __le32 ext4_encode_extra_time(struct timespec64 *time) +{ + u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline void ext4_decode_extra_time(struct timespec64 *time, + __le32 extra) +{ + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) + time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ + } \ + else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + } \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) 
+ * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. Its function is same as that of + * struct timespec64 i_{a,c,m}time in the generic inode. 
+ */ + struct timespec64 i_crtime; + + /* mballoc */ + atomic_t i_prealloc_active; + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + + kprojid_t i_projid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
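To make the data journaling bits concrete: EXT4_MOUNT_DATA_FLAGS (0x00C00) is a two-bit field inside s_mount_opt holding exactly one of JOURNAL_DATA, ORDERED_DATA or WRITEBACK_DATA, so reading or changing the mode is a mask-and-or on those bits. The sketch below uses stand-in names (sketch_*) and is not part of this header; the test_opt()/set_opt() macros defined a little further down operate on the same word.

/* Sketch only: extract or replace the data journaling mode held in the
 * EXT4_MOUNT_DATA_FLAGS bit field of a raw s_mount_opt word. */
static inline unsigned int sketch_get_data_mode(unsigned int mount_opt)
{
        /* yields EXT4_MOUNT_JOURNAL_DATA, _ORDERED_DATA or _WRITEBACK_DATA */
        return mount_opt & EXT4_MOUNT_DATA_FLAGS;
}

static inline unsigned int sketch_set_data_mode(unsigned int mount_opt,
                                                unsigned int mode)
{
        return (mount_opt & ~EXT4_MOUNT_DATA_FLAGS) | mode;
}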
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ + +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. 
+ * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32] __nonstring; /* function where the 
error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __u8 s_wtime_hi; + __u8 s_mtime_hi; + __u8 s_mkfs_time_hi; + __u8 s_lastcheck_hi; + __u8 s_first_error_time_hi; + __u8 s_last_error_time_hi; + __u8 s_pad[2]; + __le16 s_encoding; /* Filename charset encoding */ + __le16 s_encoding_flags; /* Filename charset encoding flags */ + __le32 s_reserved[95]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 3 + +#define EXT4_ENC_UTF8_12_1 1 + +/* + * Flags for ext4_sb_info.s_encoding_flags. 
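A minimal sketch of what the error-window offsets above allow (hypothetical helper, not from this header): EXT4_S_ERR_START and EXT4_S_ERR_END bracket the s_error_count .. s_last_error_func fields, so the whole window can be carried between two in-memory superblock images with a single memcpy of EXT4_S_ERR_LEN bytes.

/* Sketch only: copy the error-tracking fields from one in-memory
 * superblock image to another (assumes memcpy is available). */
static inline void sketch_copy_error_window(struct ext4_super_block *dst,
                                            const struct ext4_super_block *src)
{
        memcpy((char *)dst + EXT4_S_ERR_START,
               (const char *)src + EXT4_S_ERR_START,
               EXT4_S_ERR_LEN);
}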
+ */ +#define EXT4_ENC_STRICT_MODE_FL (1 << 0) + +#define ext4_has_strict_mode(sbi) \ + (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head * __rcu *s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; +#ifdef CONFIG_UNICODE + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_ext4_flags; /* Ext4 superblock flags */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct ext4_system_blocks __rcu *system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ** __rcu *s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List 
of blocks to be freed + after commit completed */ + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_mb_max_inode_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + + /* + * Barrier between writepages ops and changing any inode's JOURNAL_DATA + * or EXTENTS flag. 
+ */ + struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Returns: sbi->field[index] + * Used to access an array element from the following sbi fields which require + * rcu protection to avoid dereferencing an invalid pointer due to reassignment + * - s_group_desc + * - s_group_info + * - s_flex_group + */ +#define sbi_array_rcu_deref(sbi, field, index) \ +({ \ + typeof(*((sbi)->field)) _v; \ + rcu_read_lock(); \ + _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ + rcu_read_unlock(); \ + _v; \ +}) + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. 
This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + +/* + * Feature set definitions + */ + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
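A minimal sketch of how the feature words above are consulted (sketch_* is a stand-in name, not part of this header): every feature test reduces to masking one of the three little-endian feature fields in the superblock, and the EXT4_FEATURE_*_FUNCS() generator macros that follow emit exactly this pattern, one set of helpers per feature bit.

/* Sketch only: open-coded test of a single ro_compat feature bit. */
static inline bool sketch_has_ro_compat_bit(struct super_block *sb, __u32 bit)
{
        return (EXT4_SB(sb)->s_es->s_feature_ro_compat &
                cpu_to_le32(bit)) != 0;
}
/* e.g. sketch_has_ro_compat_bit(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)
 * matches what the generated ext4_has_feature_metadata_csum(sb) returns. */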
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 + +extern void ext4_update_dynamic_rev(struct super_block *sb); + +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) 
+EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) +EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXTN_FEATURE_FUNCS(ver) \ +static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static 
inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + +/* + * Superblock flags + */ +#define EXT4_FLAGS_RESIZING 0 +#define EXT4_FLAGS_SHUTDOWN 1 + +static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); +} + + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + *(u32 *)desc.ctx = crc; + + BUG_ON(crypto_shash_update(&desc.shash, address, length)); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + +struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_str crypto_buf; +#endif +#ifdef CONFIG_UNICODE + struct fscrypt_str cf_name; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) 
+{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. 
+ */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t 
ext4_inode_to_goal_block(struct inode *); + +#ifdef CONFIG_UNICODE +extern void ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct fscrypt_str *fname); +#endif + +#ifdef CONFIG_FS_ENCRYPTION +static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, + const struct fscrypt_name *src) +{ + memset(dst, 0, sizeof(*dst)); + + dst->usr_fname = src->usr_fname; + dst->disk_name = src->disk_name; + dst->hinfo.hash = src->hash; + dst->hinfo.minor_hash = src->minor_hash; + dst->crypto_buf = src->crypto_buf; +} + +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); +#endif + return 0; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); +#endif + return 0; +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; + +#ifdef CONFIG_UNICODE + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} +#else /* !CONFIG_FS_ENCRYPTION */ +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); +#endif + + return 0; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ +#ifdef CONFIG_UNICODE + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} +#endif /* !CONFIG_FS_ENCRYPTION */ + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); +static inline void ext4_update_dx_flag(struct inode *inode) +{ 
+ if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... */ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } +} +static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + 0, (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern const struct seq_operations ext4_mb_seq_groups_ops; +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); + +/* inode.c */ +int ext4_inode_is_fast_symlink(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, 
struct buffer_head **bhs); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern int ext4_truncate(struct inode *); +extern int ext4_break_layouts(struct inode *); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + 
ext4_lblk_t start, ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern bool ext4_empty_dir(struct inode *inode); + +/* resize.c */ +extern void ext4_kvfree_array_rcu(void *to_free); +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); +extern int ext4_calculate_overhead(struct super_block *sb); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); + +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#define EXT4_ERROR_INODE(inode, fmt, a...) 
\ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && + !EXT4_SB(sb)->s_chksum_driver); + + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline 
void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) +{ + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info **grp_info; + long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. 
Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +#ifdef CONFIG_UNICODE +extern const struct dentry_operations ext4_dentry_ops; +#endif + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct inode *dir, struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int ext4_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline, __u64 start, __u64 len); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct 
inode *inode); + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void ext4_initialize_dirent_tail(struct buffer_head *bh, + unsigned int blocksize); +extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +extern int ext4_ci_compare(const struct inode *parent, + const struct qstr *fname, + const struct qstr *entry, bool quick); + +#define S_SHIFT 12 +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (ext4_has_feature_filetype(sb)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages, bool is_readahead); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); + +/* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* sysfs.c */ +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
+ */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path **, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* verity.c */ +extern const struct 
fsverity_operations ext4_verityops; + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; + +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + +extern const struct iomap_ops ext4_iomap_ops; + +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _EXT4_H */ diff --git a/ops/os_stat/os_stat/include_private/fs/ext4_new/extents_status.h b/ops/os_stat/os_stat/include_private/fs/ext4_new/extents_status.h new file mode 100644 index 0000000000000000000000000000000000000000..80a62ee17a81d073368dbec55e93c108b413b4fb --- /dev/null +++ b/ops/os_stat/os_stat/include_private/fs/ext4_new/extents_status.h @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. 
+ */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. + * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. 
+ */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, + struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +extern int ext4_seq_es_shrinker_info_show(struct 
seq_file *seq, void *v); + +extern unsigned int ext4_shrink_es_timeout; +extern unsigned int ext4_shrink_es_timeout_min; + +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_clear_inode_es(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/ops/os_stat/os_stat/include_private/fs/ext4_old/ext4.h b/ops/os_stat/os_stat/include_private/fs/ext4_old/ext4.h new file mode 100644 index 0000000000000000000000000000000000000000..f1cc8f7de279a1630d9091092a14e5cfbb3a11a1 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/fs/ext4_old/ext4.h @@ -0,0 +1,3444 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#include +#include + +#include + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. 
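+ * An unwritten extent (e.g. one preallocated with fallocate) only becomes a
+ * regular written extent once the I/O covering it has completed; the io_end
+ * below records the affected byte range (offset, size) so the conversion can
+ * be deferred to the work queue rather than run in the I/O completion path.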
+ */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + atomic_t count; /* reference counter */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +} ext4_io_end_t; + +struct ext4_io_submit { + struct writeback_control *io_wbc; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) 
(EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ + EXT4_PROJINHERIT_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 mdata_blocks; + __u32 free_clusters_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define 
EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. */ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT +#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + +#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define 
EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK 
((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. 
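+ *
+ * For example, tv_sec = 0x100000000 (2106-02-07) is stored as 0x00000000 in
+ * the 32-bit on-disk field with extra epoch bits 01; on decode the msb of
+ * the 32-bit value is 0, so 0x100000000 is added back to reconstruct the
+ * 64-bit tv_sec (the "0 1 0" row of the table above).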
+ */ + +static inline __le32 ext4_encode_extra_time(struct timespec64 *time) +{ + u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline void ext4_decode_extra_time(struct timespec64 *time, + __le32 extra) +{ + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) + time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ + } \ + else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + } \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) 
+ * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. Its function is same as that of + * struct timespec64 i_{a,c,m}time in the generic inode. 
+ */ + struct timespec64 i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + + kprojid_t i_projid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
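+ * For example, EXT4_MOUNT2_EXPLICIT_DELALLOC below records that "delalloc"
+ * was requested explicitly on the mount command line, which lets the mount
+ * code reject an explicit delalloc + data=journal combination instead of
+ * silently disabling delayed allocation.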
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ + +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. 
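+ * An unknown bit in the read-only compatible feature set, by contrast, only
+ * prevents mounting the filesystem read-write; a read-only mount is still
+ * allowed.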
+ * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32] __nonstring; /* function where the 
error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __u8 s_wtime_hi; + __u8 s_mtime_hi; + __u8 s_mkfs_time_hi; + __u8 s_lastcheck_hi; + __u8 s_first_error_time_hi; + __u8 s_last_error_time_hi; + __u8 s_pad[2]; + __le16 s_encoding; /* Filename charset encoding */ + __le16 s_encoding_flags; /* Filename charset encoding flags */ + __le32 s_reserved[95]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 3 + +#define EXT4_ENC_UTF8_12_1 1 + +/* + * Flags for ext4_sb_info.s_encoding_flags. 
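+ * When EXT4_ENC_STRICT_MODE_FL is set, file names that are not valid
+ * sequences in the configured encoding (e.g. malformed UTF-8) are rejected
+ * instead of being treated as opaque byte strings.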
+ */ +#define EXT4_ENC_STRICT_MODE_FL (1 << 0) + +#define ext4_has_strict_mode(sbi) \ + (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head * __rcu *s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; +#ifdef CONFIG_UNICODE + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_ext4_flags; /* Ext4 superblock flags */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct ext4_system_blocks __rcu *system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ** __rcu *s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List 
of blocks to be freed + after commit completed */ + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + + /* + * Barrier between writepages ops and changing any inode's JOURNAL_DATA + * or EXTENTS flag. 
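+ * ext4_writepages() holds this for reading for the duration of writeback,
+ * while paths that change those flags (e.g. the journal-data ioctl or extent
+ * migration) take it for writing, so the writeback path cannot observe the
+ * flags changing underneath it.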
+ */ + struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Returns: sbi->field[index] + * Used to access an array element from the following sbi fields which require + * rcu protection to avoid dereferencing an invalid pointer due to reassignment + * - s_group_desc + * - s_group_info + * - s_flex_group + */ +#define sbi_array_rcu_deref(sbi, field, index) \ +({ \ + typeof(*((sbi)->field)) _v; \ + rcu_read_lock(); \ + _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ + rcu_read_unlock(); \ + _v; \ +}) + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. 
This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + +/* + * Feature set definitions + */ + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 + +extern void ext4_update_dynamic_rev(struct super_block *sb); + +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) 
+EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) +EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXTN_FEATURE_FUNCS(ver) \ +static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static 
inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + +/* + * Superblock flags + */ +#define EXT4_FLAGS_RESIZING 0 +#define EXT4_FLAGS_SHUTDOWN 1 + +static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); +} + + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + *(u32 *)desc.ctx = crc; + + BUG_ON(crypto_shash_update(&desc.shash, address, length)); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + +struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_str crypto_buf; +#endif +#ifdef CONFIG_UNICODE + struct fscrypt_str cf_name; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) 
+{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. 
+ */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t
ext4_inode_to_goal_block(struct inode *); + +#ifdef CONFIG_UNICODE +extern void ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct fscrypt_str *fname); +#endif + +#ifdef CONFIG_FS_ENCRYPTION +static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, + const struct fscrypt_name *src) +{ + memset(dst, 0, sizeof(*dst)); + + dst->usr_fname = src->usr_fname; + dst->disk_name = src->disk_name; + dst->hinfo.hash = src->hash; + dst->hinfo.minor_hash = src->minor_hash; + dst->crypto_buf = src->crypto_buf; +} + +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); +#endif + return 0; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); +#endif + return 0; +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; + +#ifdef CONFIG_UNICODE + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} +#else /* !CONFIG_FS_ENCRYPTION */ +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); +#endif + + return 0; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ +#ifdef CONFIG_UNICODE + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} +#endif /* !CONFIG_FS_ENCRYPTION */ + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); +static inline void ext4_update_dx_flag(struct inode *inode) +{ 
+ if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... */ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } +} +static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + 0, (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern const struct seq_operations ext4_mb_seq_groups_ops; +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); + +/* inode.c */ +int ext4_inode_is_fast_symlink(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head 
**bhs); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern int ext4_truncate(struct inode *); +extern int ext4_break_layouts(struct inode *); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, 
ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern bool ext4_empty_dir(struct inode *inode); + +/* resize.c */ +extern void ext4_kvfree_array_rcu(void *to_free); +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); +extern int ext4_calculate_overhead(struct super_block *sb); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); + +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) 
\ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && + !EXT4_SB(sb)->s_chksum_driver); + + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline 
void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) +{ + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info **grp_info; + long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. 
Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +#ifdef CONFIG_UNICODE +extern const struct dentry_operations ext4_dentry_ops; +#endif + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct inode *dir, struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int ext4_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline, __u64 start, __u64 len); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct 
inode *inode); + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void ext4_initialize_dirent_tail(struct buffer_head *bh, + unsigned int blocksize); +extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +extern int ext4_ci_compare(const struct inode *parent, + const struct qstr *fname, + const struct qstr *entry, bool quick); + +#define S_SHIFT 12 +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (ext4_has_feature_filetype(sb)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages, bool is_readahead); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); + +/* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* sysfs.c */ +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
+ */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path **, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* verity.c */ +extern const struct 
fsverity_operations ext4_verityops; + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; + +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + +extern const struct iomap_ops ext4_iomap_ops; + +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _EXT4_H */ diff --git a/ops/os_stat/os_stat/include_private/fs/ext4_old/extents_status.h b/ops/os_stat/os_stat/include_private/fs/ext4_old/extents_status.h new file mode 100644 index 0000000000000000000000000000000000000000..80a62ee17a81d073368dbec55e93c108b413b4fb --- /dev/null +++ b/ops/os_stat/os_stat/include_private/fs/ext4_old/extents_status.h @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. 
+ */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. + * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. 
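+ *
+ * In terms of the declarations that follow: each node is a struct
+ * pending_reservation keyed by its logical cluster number (lclu), held in
+ * a struct ext4_pending_tree; callers query and prune the set through
+ * ext4_is_pending() and ext4_remove_pending(), declared further down in
+ * this header, e.g. before deciding whether a cluster losing its last
+ * delayed block is still shared and must keep its reservation.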
+ */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, + struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +extern int ext4_seq_es_shrinker_info_show(struct 
seq_file *seq, void *v); + +extern unsigned int ext4_shrink_es_timeout; +extern unsigned int ext4_shrink_es_timeout_min; + +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_clear_inode_es(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/ops/os_stat/os_stat/include_private/fs/proc/internal.h b/ops/os_stat/os_stat/include_private/fs/proc/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..1d9488e24fc8e3caccade55debd5e76a9e19013a --- /dev/null +++ b/ops/os_stat/os_stat/include_private/fs/proc/internal.h @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Internal procfs definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct ctl_table_header; +struct mempolicy; + +/* + * This is not completely implemented yet. The idea is to + * create an in-memory tree (like the actual /proc filesystem + * tree) of these proc_dir_entries, so that we can dynamically + * add new files to /proc. + * + * parent/subdir are used for the directory structure (every /proc file has a + * parent, but "subdir" is empty for all non-directory entries). + * subdir_node is used to build the rb tree "subdir" of the parent. + */ +struct proc_dir_entry { + /* + * number of callers into module in progress; + * negative -> it's going away RSN + */ + atomic_t in_use; + refcount_t refcnt; + struct list_head pde_openers; /* who did ->open, but not ->release */ + /* protects ->pde_openers and all struct pde_opener instances */ + spinlock_t pde_unload_lock; + struct completion *pde_unload_completion; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + const struct dentry_operations *proc_dops; + union { + const struct seq_operations *seq_ops; + int (*single_show)(struct seq_file *, void *); + }; + proc_write_t write; + void *data; + unsigned int state_size; + unsigned int low_ino; + nlink_t nlink; + kuid_t uid; + kgid_t gid; + loff_t size; + struct proc_dir_entry *parent; + struct rb_root subdir; + struct rb_node subdir_node; + char *name; + umode_t mode; + u8 namelen; + char inline_name[]; +} __randomize_layout; + +#define SIZEOF_PDE ( \ + sizeof(struct proc_dir_entry) < 128 ? 128 : \ + sizeof(struct proc_dir_entry) < 192 ? 192 : \ + sizeof(struct proc_dir_entry) < 256 ? 256 : \ + sizeof(struct proc_dir_entry) < 512 ? 
512 : \ + 0) +#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) + +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + +union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); + const char *lsm; +}; + +struct proc_inode { + struct pid *pid; + unsigned int fd; + union proc_op op; + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; + struct hlist_node sysctl_inodes; + const struct proc_ns_operations *ns_ops; + struct inode vfs_inode; +} __randomize_layout; + +/* + * General functions + */ +static inline struct proc_inode *PROC_I(const struct inode *inode) +{ + return container_of(inode, struct proc_inode, vfs_inode); +} + +static inline struct proc_dir_entry *PDE(const struct inode *inode) +{ + return PROC_I(inode)->pde; +} + +static inline void *__PDE_DATA(const struct inode *inode) +{ + return PDE(inode)->data; +} + +static inline struct pid *proc_pid(const struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(const struct inode *inode) +{ + return get_pid_task(proc_pid(inode), PIDTYPE_PID); +} + +void task_dump_owner(struct task_struct *task, umode_t mode, + kuid_t *ruid, kgid_t *rgid); + +unsigned name_to_int(const struct qstr *qstr); +/* + * Offset of the first process in the /proc root directory.. + */ +#define FIRST_PROCESS_ENTRY 256 + +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 13 + +/* + * array.c + */ +extern const struct file_operations proc_tid_children_operations; + +extern void proc_task_name(struct seq_file *m, struct task_struct *p, + bool escape); +extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int host_pid_info(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +#ifdef CONFIG_DAMON_VADDR +extern int proc_damon_map(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +#endif +/* + * base.c + */ +extern const struct dentry_operations pid_dentry_operations; +extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int proc_setattr(struct dentry *, struct iattr *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); +extern void pid_update_inode(struct task_struct *, struct inode *); +extern int pid_delete_dentry(const struct dentry *); +extern int proc_pid_readdir(struct file *, struct dir_context *); +struct dentry *proc_pid_lookup(struct dentry *, unsigned int); +extern loff_t mem_lseek(struct file *, loff_t, int); + +/* Lookups */ +typedef struct dentry *instantiate_t(struct dentry *, + struct task_struct *, const void *); +bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, + instantiate_t, struct task_struct *, const void *); + +/* + * generic.c + */ +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data); +struct proc_dir_entry 
*proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); +extern int proc_readdir(struct file *, struct dir_context *); +int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); + +static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +{ + refcount_inc(&pde->refcnt); + return pde; +} +extern void pde_put(struct proc_dir_entry *); + +static inline bool is_empty_pde(const struct proc_dir_entry *pde) +{ + return S_ISDIR(pde->mode) && !pde->proc_iops; +} +extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); + +/* + * inode.c + */ +struct pde_opener { + struct file *file; + struct list_head lh; + bool closing; + struct completion *c; +} __randomize_layout; +extern const struct inode_operations proc_link_inode_operations; +extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct super_operations proc_sops; + +void proc_init_kmemcache(void); +void set_proc_pid_nlink(void); +extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +extern void proc_entry_rundown(struct proc_dir_entry *); + +/* + * proc_namespaces.c + */ +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + +/* + * proc_net.c + */ +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +/* + * proc_self.c + */ +extern int proc_setup_self(struct super_block *); + +/* + * proc_thread_self.c + */ +extern int proc_setup_thread_self(struct super_block *); +extern void proc_thread_self_init(void); + +/* + * proc_sysctl.c + */ +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head); +#else +static inline void proc_sys_init(void) { } +static inline void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head) { } +#endif + +/* + * proc_tty.c + */ +#ifdef CONFIG_TTY +extern void proc_tty_init(void); +#else +static inline void proc_tty_init(void) {} +#endif + +/* + * root.c + */ +extern struct proc_dir_entry proc_root; + +extern void proc_self_init(void); + +/* + * task_[no]mmu.c + */ +struct mem_size_stats; +struct proc_maps_private { + struct inode *inode; + struct task_struct *task; + struct mm_struct *mm; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif +} __randomize_layout; + +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_pid_smaps_rollup_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; + +extern unsigned long task_vsize(struct mm_struct *); +extern unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +#ifdef CONFIG_MMU +extern void task_mem(struct 
seq_file *m, struct mm_struct *mm, struct task_struct *task); +#else +extern void task_mem(struct seq_file *, struct mm_struct *); +#endif + +extern const struct dentry_operations proc_net_dentry_ops; +static inline void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} diff --git a/ops/os_stat/os_stat/include_private/fs/xfs/xfs_log_priv.h b/ops/os_stat/os_stat/include_private/fs/xfs/xfs_log_priv.h new file mode 100644 index 0000000000000000000000000000000000000000..b880c23cb6e4ffd78324ff26a2890c0010f67d64 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/fs/xfs/xfs_log_priv.h @@ -0,0 +1,607 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_LOG_PRIV_H__ +#define __XFS_LOG_PRIV_H__ + +struct xfs_buf; +struct xlog; +struct xlog_ticket; +struct xfs_mount; + +/* + * Flags for log structure + */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. + * + * this has endian issues, of course. + */ +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ + +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. 
If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). + * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. + * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + +/* Ticket reservation region accounting */ +#define XLOG_TIC_LEN_MAX 15 + +/* + * Reservation region + * As would be stored in xfs_log_iovec but without the i_addr which + * we don't care about. 
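+ *
+ * One xlog_res is recorded per reservation region in a ticket's
+ * t_res_arr[] (up to XLOG_TIC_LEN_MAX entries, see xlog_ticket below);
+ * xlog_print_tic_res(), declared later in this header, reports that
+ * accounting, for example when a ticket's reservation is overrun.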
+ */ +typedef struct xlog_res { + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ +} xlog_res_t; + +typedef struct xlog_ticket { + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + char t_ocnt; /* original count : 1 */ + char t_cnt; /* current count : 1 */ + char t_clientid; /* who does this belong to; : 1 */ + char t_flags; /* properties of reservation : 1 */ + + /* reservation array fields */ + uint t_res_num; /* num in array : 4 */ + uint t_res_num_ophdrs; /* num op hdrs : 4 */ + uint t_res_arr_sum; /* array sum : 4 */ + uint t_res_o_flow; /* sum overflow : 4 */ + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ +} xlog_ticket_t; + +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. + * - ic_next is the pointer to the next iclog in the ring. + * - ic_log is a pointer back to the global log structure. + * - ic_size is the full size of the log buffer, minus the cycle headers. + * - ic_io_size is the size of the currently pending log buffer write, which + * might be smaller than ic_size + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callbacks + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. + */ +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xlog *ic_log; + u32 ic_size; + u32 ic_io_size; + u32 ic_offset; + unsigned short ic_state; + char *ic_datap; /* pointer to iclog data */ + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + struct list_head ic_callbacks; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header +#ifdef DEBUG + bool ic_fail_crc : 1; +#endif + struct semaphore ic_sema; + struct work_struct ic_end_io_work; + struct bio ic_bio; + struct bio_vec ic_bvec[]; +} xlog_in_core_t; + +/* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. 
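+ *
+ * Roughly, one xfs_cil_ctx below describes a single checkpoint in flight:
+ * its sequence number, the ticket paying for it, the log vector chain
+ * being written and the busy extents tied to it, while xfs_cil.xc_ctx
+ * always points at the context currently accumulating transactions.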
+ */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct list_head iclog_entry; + struct list_head committing; /* ctx committing list */ + struct work_struct discard_endio_work; +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct xlog *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; + struct xfs_cil_ctx *xc_ctx; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; +} ____cacheline_aligned_in_smp; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. 
+ * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) + +/* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. + */ +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number. Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. 
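+ *
+ * Concretely (see xlog_assign_grant_head_val() and
+ * xlog_crack_grant_head_val() later in this header), each grant head packs
+ * both components into a single atomic64_t:
+ *
+ *	val   = ((int64_t)cycle << 32) | byte_offset;
+ *	cycle = val >> 32;
+ *	bytes = val & 0xffffffff;
+ *
+ * so a head can be sampled or updated with one 64-bit atomic operation.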
+ */ +struct xlog { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */ + struct delayed_work l_work; /* background flush work */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; + + struct xfs_kobj l_kobj; + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + void *l_iclog_bak[XLOG_MAX_ICLOGS]; + /* log record crc error injection factor */ + uint32_t l_badcrc_factor; +#endif + /* log recovery lsn tracking (for buffer submission */ + xfs_lsn_t l_recovery_lsn; +}; + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) + +/* common routines */ +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); +extern void +xlog_recover_cancel(struct xlog *); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +void xlog_print_trans(struct xfs_trans *); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are 
cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); + +/* + * CIL force routines + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct xlog *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} + +/* + * The LSN is valid so long as it is behind the current LSN. If it isn't, this + * means that the next log record that includes this metadata could have a + * smaller LSN. In turn, this means that the modification in the log would not + * replay. + */ +static inline bool +xlog_valid_lsn( + struct xlog *log, + xfs_lsn_t lsn) +{ + int cur_cycle; + int cur_block; + bool valid = true; + + /* + * First, sample the current lsn without locking to avoid added + * contention from metadata I/O. The current cycle and block are updated + * (in xlog_state_switch_iclogs()) and read here in a particular order + * to avoid false negatives (e.g., thinking the metadata LSN is valid + * when it is not). + * + * The current block is always rewound before the cycle is bumped in + * xlog_state_switch_iclogs() to ensure the current LSN is never seen in + * a transiently forward state. Instead, we can see the LSN in a + * transiently behind state if we happen to race with a cycle wrap. 
+ */ + cur_cycle = READ_ONCE(log->l_curr_cycle); + smp_rmb(); + cur_block = READ_ONCE(log->l_curr_block); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { + /* + * If the metadata LSN appears invalid, it's possible the check + * above raced with a wrap to the next log cycle. Grab the lock + * to check for sure. + */ + spin_lock(&log->l_icloglock); + cur_cycle = log->l_curr_cycle; + cur_block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) + valid = false; + } + + return valid; +} + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/ops/os_stat/os_stat/include_private/fs/xfs/xfs_trans_priv.h b/ops/os_stat/os_stat/include_private/fs/xfs/xfs_trans_priv.h new file mode 100644 index 0000000000000000000000000000000000000000..2e073c1c4614f2a79cc9452854da1cead65fb06c --- /dev/null +++ b/ops/os_stat/os_stat/include_private/fs/xfs/xfs_trans_priv.h @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_TRANS_PRIV_H__ +#define __XFS_TRANS_PRIV_H__ + +struct xfs_log_item; +struct xfs_mount; +struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; + + +void xfs_trans_init(struct xfs_mount *); +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); + +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, bool aborted); +/* + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. + */ +struct xfs_ail_cursor { + struct list_head list; + struct xfs_log_item *item; +}; + +/* + * Private AIL structures. + * + * Eventually we need to drive the locking in here as well. + */ +struct xfs_ail { + struct xfs_mount *ail_mount; + struct task_struct *ail_task; + struct list_head ail_head; + xfs_lsn_t ail_target; + xfs_lsn_t ail_target_prev; + struct list_head ail_cursors; + spinlock_t ail_lock; + xfs_lsn_t ail_last_pushed_lsn; + int ail_log_flush; + struct list_head ail_buf_list; + wait_queue_head_t ail_empty; +}; + +/* + * From xfs_trans_ail.c + */ +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->ail_lock); +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. 
+ */ +static inline struct xfs_log_item * +xfs_ail_min( + struct xfs_ail *ailp) +{ + return list_first_entry_or_null(&ailp->ail_head, struct xfs_log_item, + li_ail); +} + +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->ail_lock) +{ + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->ail_lock); + +static inline void +xfs_trans_ail_remove( + struct xfs_log_item *lip, + int shutdown_type) +{ + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->ail_lock); + /* xfs_trans_ail_delete() drops the AIL lock */ + if (test_bit(XFS_LI_IN_AIL, &lip->li_flags)) + xfs_trans_ail_delete(ailp, lip, shutdown_type); + else + spin_unlock(&ailp->ail_lock); +} + +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +void xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); + +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); + +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->ail_lock); + *dst = *src; + spin_unlock(&ailp->ail_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); + lockdep_assert_held(&lip->li_ailp->ail_lock); + + if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->ail_lock); + + if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { + xfs_buf_hold(bp); + lip->li_buf = bp; + } +} + +#endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/ops/os_stat/os_stat/include_private/include/generated/asm-offsets.h b/ops/os_stat/os_stat/include_private/include/generated/asm-offsets.h new file mode 100644 index 0000000000000000000000000000000000000000..acdc417f455c775e379bdf47a2d9656aaf1e0afe --- /dev/null +++ b/ops/os_stat/os_stat/include_private/include/generated/asm-offsets.h @@ -0,0 +1,87 @@ +#ifndef __ASM_OFFSETS_H__ +#define __ASM_OFFSETS_H__ +/* + * DO NOT MODIFY. 
+ * + * This file was generated by Kbuild + */ + + +#define KVM_STEAL_TIME_preempted 16 /* offsetof(struct kvm_steal_time, preempted) */ + +#define pt_regs_bx 40 /* offsetof(struct pt_regs, bx) */ +#define pt_regs_cx 88 /* offsetof(struct pt_regs, cx) */ +#define pt_regs_dx 96 /* offsetof(struct pt_regs, dx) */ +#define pt_regs_sp 152 /* offsetof(struct pt_regs, sp) */ +#define pt_regs_bp 32 /* offsetof(struct pt_regs, bp) */ +#define pt_regs_si 104 /* offsetof(struct pt_regs, si) */ +#define pt_regs_di 112 /* offsetof(struct pt_regs, di) */ +#define pt_regs_r8 72 /* offsetof(struct pt_regs, r8) */ +#define pt_regs_r9 64 /* offsetof(struct pt_regs, r9) */ +#define pt_regs_r10 56 /* offsetof(struct pt_regs, r10) */ +#define pt_regs_r11 48 /* offsetof(struct pt_regs, r11) */ +#define pt_regs_r12 24 /* offsetof(struct pt_regs, r12) */ +#define pt_regs_r13 16 /* offsetof(struct pt_regs, r13) */ +#define pt_regs_r14 8 /* offsetof(struct pt_regs, r14) */ +#define pt_regs_r15 0 /* offsetof(struct pt_regs, r15) */ +#define pt_regs_flags 144 /* offsetof(struct pt_regs, flags) */ + +#define saved_context_cr0 200 /* offsetof(struct saved_context, cr0) */ +#define saved_context_cr2 208 /* offsetof(struct saved_context, cr2) */ +#define saved_context_cr3 216 /* offsetof(struct saved_context, cr3) */ +#define saved_context_cr4 224 /* offsetof(struct saved_context, cr4) */ +#define saved_context_gdt_desc 266 /* offsetof(struct saved_context, gdt_desc) */ + +#define TSS_ist 36 /* offsetof(struct tss_struct, x86_tss.ist) */ +#define DB_STACK_OFFSET 12288 /* offsetof(struct cea_exception_stacks, DB_stack) - offsetof(struct cea_exception_stacks, DB1_stack) */ + +#define stack_canary_offset 40 /* offsetof(struct fixed_percpu_data, stack_canary) */ + +#define __NR_syscall_max 435 /* sizeof(syscalls_64) - 1 */ +#define NR_syscalls 436 /* sizeof(syscalls_64) */ +#define __NR_syscall_compat_max 435 /* sizeof(syscalls_ia32) - 1 */ +#define IA32_NR_syscalls 436 /* sizeof(syscalls_ia32) */ + +#define TASK_threadsp 9368 /* offsetof(struct task_struct, thread.sp) */ +#define TASK_stack_canary 2680 /* offsetof(struct task_struct, stack_canary) */ + +#define TASK_addr_limit 9496 /* offsetof(struct task_struct, thread.addr_limit) */ + +#define crypto_tfm_ctx_offset 64 /* offsetof(struct crypto_tfm, __crt_ctx) */ + +#define pbe_address 0 /* offsetof(struct pbe, address) */ +#define pbe_orig_address 8 /* offsetof(struct pbe, orig_address) */ +#define pbe_next 16 /* offsetof(struct pbe, next) */ + +#define IA32_SIGCONTEXT_ax 44 /* offsetof(struct sigcontext_32, ax) */ +#define IA32_SIGCONTEXT_bx 32 /* offsetof(struct sigcontext_32, bx) */ +#define IA32_SIGCONTEXT_cx 40 /* offsetof(struct sigcontext_32, cx) */ +#define IA32_SIGCONTEXT_dx 36 /* offsetof(struct sigcontext_32, dx) */ +#define IA32_SIGCONTEXT_si 20 /* offsetof(struct sigcontext_32, si) */ +#define IA32_SIGCONTEXT_di 16 /* offsetof(struct sigcontext_32, di) */ +#define IA32_SIGCONTEXT_bp 24 /* offsetof(struct sigcontext_32, bp) */ +#define IA32_SIGCONTEXT_sp 28 /* offsetof(struct sigcontext_32, sp) */ +#define IA32_SIGCONTEXT_ip 56 /* offsetof(struct sigcontext_32, ip) */ + +#define IA32_RT_SIGFRAME_sigcontext 164 /* offsetof(struct rt_sigframe_ia32, uc.uc_mcontext) */ + +#define BP_scratch 484 /* offsetof(struct boot_params, scratch) */ +#define BP_secure_boot 492 /* offsetof(struct boot_params, secure_boot) */ +#define BP_loadflags 529 /* offsetof(struct boot_params, hdr.loadflags) */ +#define BP_hardware_subarch 572 /* offsetof(struct boot_params, 
hdr.hardware_subarch) */ +#define BP_version 518 /* offsetof(struct boot_params, hdr.version) */ +#define BP_kernel_alignment 560 /* offsetof(struct boot_params, hdr.kernel_alignment) */ +#define BP_init_size 608 /* offsetof(struct boot_params, hdr.init_size) */ +#define BP_pref_address 600 /* offsetof(struct boot_params, hdr.pref_address) */ +#define BP_code32_start 532 /* offsetof(struct boot_params, hdr.code32_start) */ + +#define PTREGS_SIZE 168 /* sizeof(struct pt_regs) */ +#define TLB_STATE_user_pcid_flush_mask 22 /* offsetof(struct tlb_state, user_pcid_flush_mask) */ +#define CPU_ENTRY_AREA_entry_stack 4096 /* offsetof(struct cpu_entry_area, entry_stack_page) */ +#define SIZEOF_entry_stack 4096 /* sizeof(struct entry_stack) */ +#define MASK_entry_stack -4096 /* (~(sizeof(struct entry_stack) - 1)) */ +#define TSS_sp0 4 /* offsetof(struct tss_struct, x86_tss.sp0) */ +#define TSS_sp1 12 /* offsetof(struct tss_struct, x86_tss.sp1) */ +#define TSS_sp2 20 /* offsetof(struct tss_struct, x86_tss.sp2) */ + +#endif diff --git a/ops/os_stat/os_stat/include_private/include/linux/nospec.h b/ops/os_stat/os_stat/include_private/include/linux/nospec.h new file mode 100644 index 0000000000000000000000000000000000000000..207ef2a20e485d56c30cd478b4ee3ff9a6f67768 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/include/linux/nospec.h @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Linus Torvalds. All rights reserved. +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved. +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#ifndef _LINUX_NOSPEC_H +#define _LINUX_NOSPEC_H +#include + +struct task_struct; + +#ifndef barrier_nospec +# define barrier_nospec() do { } while (0) +#endif + +/** + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * When @index is out of bounds (@index >= @size), the sign bit will be + * set. Extend the sign bit to all bits and invert, giving a result of + * zero for an out of bounds index, or ~0 if within bounds [0, @size). + */ +#ifndef array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + /* + * Always calculate and emit the mask even if the compiler + * thinks the mask is not needed. The compiler does not take + * into account the value of @index under speculation. + */ + OPTIMIZER_HIDE_VAR(index); + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); +} +#endif + +/* + * array_index_nospec - sanitize an array index after a bounds check + * + * For a code sequence like: + * + * if (index < size) { + * index = array_index_nospec(index, size); + * val = array[index]; + * } + * + * ...if the CPU speculates past the bounds check then + * array_index_nospec() will clamp the index within the range of [0, + * size). 
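+ *
+ * Worked example (illustrative, 64-bit): with size == 4, an in-bounds
+ * index of 2 gives mask = ~(2 | (4 - 1 - 2)) >> 63 == ~0UL, so the index
+ * passes through unchanged; a speculated out-of-bounds index of 5 makes
+ * (4 - 1 - 5) wrap with the sign bit set, so the mask is 0 and the access
+ * is clamped to element 0 rather than an attacker-controlled offset.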
+ */ +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ + \ + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ + \ + (typeof(_i)) (_i & _mask); \ +}) + +/* Speculation control prctl */ +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which); +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl); +/* Speculation control for seccomp enforced mitigation */ +void arch_seccomp_spec_mitigate(struct task_struct *task); + +#endif /* _LINUX_NOSPEC_H */ diff --git a/ops/os_stat/os_stat/include_private/kernel/sched/autogroup.h b/ops/os_stat/os_stat/include_private/kernel/sched/autogroup.h new file mode 100644 index 0000000000000000000000000000000000000000..b96419974a1f0e88cf3e0a7b203bde1fcff89b2c --- /dev/null +++ b/ops/os_stat/os_stat/include_private/kernel/sched/autogroup.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + /* + * Reference doesn't mean how many threads attach to this + * autogroup now. It just stands for the number of tasks + * which could use this autogroup. + */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/ops/os_stat/os_stat/include_private/kernel/sched/cpudeadline.h b/ops/os_stat/os_stat/include_private/kernel/sched/cpudeadline.h new file mode 100644 index 0000000000000000000000000000000000000000..1c1181cd473f25e57790aa2407a55ab7aa54cb5c --- /dev/null +++ b/ops/os_stat/os_stat/include_private/kernel/sched/cpudeadline.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define IDX_INVALID -1 + +struct cpudl_item { + u64 dl; + int cpu; + int idx; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); +int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); +void 
cpudl_cleanup(struct cpudl *cp); +#endif /* CONFIG_SMP */ diff --git a/ops/os_stat/os_stat/include_private/kernel/sched/cpupri.h b/ops/os_stat/os_stat/include_private/kernel/sched/cpupri.h new file mode 100644 index 0000000000000000000000000000000000000000..7dc20a3232e726b3b5f91389395f49d7525120a5 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/kernel/sched/cpupri.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + atomic_t count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int *cpu_to_pri; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#endif diff --git a/ops/os_stat/os_stat/include_private/kernel/sched/features.h b/ops/os_stat/os_stat/include_private/kernel/sched/features.h new file mode 100644 index 0000000000000000000000000000000000000000..66c74aa4753e79c04d4c52d96a37525b514983ef --- /dev/null +++ b/ops/os_stat/os_stat/include_private/kernel/sched/features.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ +SCHED_FEAT(START_DEBIT, true) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, false) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, true) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, true) + +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) + +/* + * Decrement CPU capacity based on time not spent running tasks + */ +SCHED_FEAT(NONTASK_CAPACITY, true) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, true) + +/* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. + */ +SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) + +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. 
+ */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. + */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + +SCHED_FEAT(RT_RUNTIME_SHARE, false) +SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) + +SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) + +/* + * UtilEstimation. Use estimated CPU utilization. + */ +SCHED_FEAT(UTIL_EST, true) diff --git a/ops/os_stat/os_stat/include_private/kernel/sched/sched.h b/ops/os_stat/os_stat/include_private/kernel/sched/sched.h new file mode 100644 index 0000000000000000000000000000000000000000..d89b1dc5bcbb2690586f1a0d98649b880495adf8 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/kernel/sched/sched.h @@ -0,0 +1,3207 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal types and methods: + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include + +#include + +#ifdef CONFIG_PARAVIRT +# include +#endif + +#include "cpupri.h" +#include "cpudeadline.h" + +#ifdef CONFIG_SCHED_DEBUG +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +#endif + +struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 + +extern __read_mostly int scheduler_running; + +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; +#ifdef CONFIG_BT_SCHED +extern atomic_long_t calc_bt_load_tasks; +extern long calc_bt_load_fold_active(struct rq *this_rq, long adjust); +#endif + +extern void calc_global_load_tick(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +/* + * Latency nice is meant to provide scheduler hints about the relative + * latency requirements of a task with respect to other tasks. + * Thus a task with latency_nice == 19 can be hinted as the task with no + * latency requirements, in contrast to the task with latency_nice == -20 + * which should be given priority in terms of lower latency. + */ +#define MAX_LATENCY_NICE 19 +#define MIN_LATENCY_NICE -20 + +#define LATENCY_NICE_WIDTH \ + (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) + +/* + * Default tasks should be treated as a task with latency_nice = 0. + */ +#define DEFAULT_LATENCY_NICE 0 +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) + +/* + * Convert user-nice values [ -20 ... 0 ... 
19 ] + * to static latency [ 0..39 ], + * and back. + */ +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT) +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT) + +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to + * increase coverage and consistency always enable it on 64-bit platforms. + */ +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ +}) +#else +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + + +#ifdef CONFIG_BT_SCHED +#define NICE_TO_BT_PRIO(nice) (MAX_RT_PRIO + (nice) + 20 + 40) +#define BT_PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20 - 40) +#define BT_TASK_NICE(p) BT_PRIO_TO_NICE((p)->static_prio) +#endif + +/* BT uses the same nice value range as CFS and also encodes + * its static priority in task_struct's static_prio field + */ +#ifdef CONFIG_BT_SCHED +#define BT_USER_PRIO(p) ((p)-MAX_RT_PRIO-40) +#define BT_TASK_USER_PRIO(p) BT_USER_PRIO((p)->static_prio) +#endif + + +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) + +/* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE 10 + +/* + * Single value that denotes runtime == period, ie unlimited time. 
+ */ +#define RUNTIME_INF ((u64)~0ULL) + +#ifdef CONFIG_BT_SCHED +static inline int bt_policy(int policy) +{ + if (policy == SCHED_BT) + return 1; + return 0; +} + +static inline int task_has_bt_policy(const struct task_struct *p) +{ + return bt_policy(p->policy); +} + +#define RQ_CFS_NR_RUNNING(rq) \ + ((rq)->nr_running - (rq)->bt_nr_running) +#else + +#define RQ_CFS_NR_RUNNING(rq) \ + ((rq)->nr_running) +#endif + +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + +static inline int rt_policy(int policy) +{ + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; +} +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || +#ifdef CONFIG_BT_SCHED + bt_policy(policy) || +#endif + rt_policy(policy) || dl_policy(policy); +} + +static inline int task_has_idle_policy(struct task_struct *p) +{ + return idle_policy(p->policy); +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. + * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; + unsigned int rt_period_active; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +void __dl_clear_params(struct task_struct *p); + +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* + * To keep the bandwidth of -deadline tasks under control + * we need some place where: + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; + * + * This is all done in the data structure below. 
It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, bandwidth is given on a per root domain basis, + * meaning that: + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; + */ +struct dl_bw { + raw_spinlock_t lock; + u64 bw; + u64 total_bw; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + +static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +extern void dl_change_utilization(struct task_struct *p, u64 new_bw); +extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +extern bool dl_cpu_busy(unsigned int cpu); + +#ifdef CONFIG_CGROUP_SCHED + +#include +#include + +struct cfs_rq; +struct rt_rq; +#ifdef CONFIG_BT_SCHED +struct bt_rq; +#endif + +extern struct list_head task_groups; + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota; + u64 runtime; + u64 burst; + u64 buffer; + u64 max_overrun; + u64 runtime_at_period_start; + s64 hierarchical_quota; + + u8 idle; + u8 period_active; + u8 distribute_running; + u8 slack_started; + struct hrtimer period_timer; + struct hrtimer slack_timer; + struct list_head throttled_cfs_rq; + + /* Statistics: */ + int nr_periods; + int nr_throttled; + int nr_burst; + u64 throttled_time; + u64 burst_time; +#endif +#ifdef CONFIG_BT_SHARE_CFS_BANDWIDTH + u64 runtime_bt; + u64 bt_suppress_percent; + struct list_head throttled_bt_rq; + u8 idle_bt; + u8 distribute_running_bt; +#endif +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +/* Task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each CPU */ + struct sched_entity **se; + /* runqueue "owned" by this group on each CPU */ + struct cfs_rq **cfs_rq; + unsigned long shares; + int latency_prio; + +#ifdef CONFIG_SMP + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. 
+ */ + atomic_long_t load_avg ____cacheline_aligned; +#ifdef CONFIG_HT_ISOLATE + int ht_sensi_type; +#endif +#endif +#endif + +#ifdef CONFIG_BT_GROUP_SCHED + struct sched_bt_entity **bt; + struct bt_rq **bt_rq; + unsigned long bt_shares; + + atomic64_t bt_load_avg; + int offline; + struct mutex offline_mutex; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + u64 cpuquota_aware; + struct cfs_bandwidth cfs_bandwidth; + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* The two decimal precision [%] value requested from user-space */ + unsigned int uclamp_pct[UCLAMP_CNT]; + /* Clamp values requested for a task group */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a task group */ + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif + +#ifdef CONFIG_BT_GROUP_SCHED +#define ROOT_TASK_GROUP_BT_LOAD NICE_0_LOAD +#define MIN_BT_SHARES (1UL << 1) +#define MAX_BT_SHARES (1UL << 18) +#define CGROUP_BT_PRIORITY 7 +#endif + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. 
+ */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); +extern void unregister_fair_sched_group(struct task_group *tg); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); + +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, int init); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_BT_SCHED +extern void free_bt_sched_group(struct task_group *tg); +extern int alloc_bt_sched_group(struct task_group *tg, struct task_group *parent); +extern int sched_group_set_bt_shares(struct task_group *tg, unsigned long shares); +extern void unregister_bt_sched_group(struct task_group *tg); +extern void init_tg_bt_entry(struct task_group *tg, struct bt_rq *bt_rq, + struct sched_bt_entity *se, int cpu, + struct sched_bt_entity *parent); +extern int sched_bt_can_attach(struct task_group *tg, struct task_struct *tsk); +#endif + +#ifdef CONFIG_BT_GROUP_SCHED +#ifdef CONFIG_BT_BANDWIDTH +extern void online_bt_sched_group(struct task_group *tg); +#endif +#endif + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group *tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); + +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_latency(struct task_group *tg, long latency); + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_BT_GROUP_SCHED +extern int sched_group_set_bt_shares(struct task_group *tg, unsigned long shares); +#endif + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long runnable_weight; + unsigned int nr_running; + unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ + + u64 exec_clock; + u64 min_vruntime; +#ifdef CONFIG_SCHED_CORE + 
unsigned int forceidle_seq; + u64 min_vruntime_fi; +#endif + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root_cached tasks_timeline; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr; + struct sched_entity *next; + struct sched_entity *last; + struct sched_entity *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP + /* + * CFS load tracking + */ + struct sched_avg avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + struct { + raw_spinlock_t lock ____cacheline_aligned; + int nr; + unsigned long load_avg; + unsigned long util_avg; + unsigned long runnable_sum; + } removed; + +#ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. + * This list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; + + u64 throttled_clock; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; + int throttled; + int throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + KABI_RESERVE(1); + KABI_RESERVE(2); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_BT_BANDWIDTH +struct bt_bandwidth { + raw_spinlock_t bt_runtime_lock; + ktime_t bt_period; + u64 bt_runtime; + struct hrtimer bt_period_timer; + int timer_active; + unsigned int bt_period_active; +}; +#endif + +#ifdef CONFIG_BT_SCHED +struct bt_rq { + struct load_weight load; + unsigned int nr_running, h_nr_running; + unsigned long nr_uninterruptible; + + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + struct sched_bt_entity *curr; +#ifdef CONFIG_BT_BANDWIDTH + int bt_throttled; + u64 bt_time; + u64 bt_runtime; + raw_spinlock_t bt_runtime_lock; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; +#endif +#ifdef CONFIG_BT_SHARE_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; + u64 throttled_clock_bt; + int throttled; + struct list_head throttled_list; +#endif +#ifdef CONFIG_SMP + /* + * BT Load tracking + */ + struct sched_avg_bt avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; + +#ifdef CONFIG_BT_GROUP_SCHED + unsigned long tg_load_avg_contrib; +#endif /* CONFIG_BT_GROUP_SCHED */ + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_BT_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this bt_rq is attached */ + + int 
on_list; + struct list_head leaf_bt_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ +#endif /* CONFIG_BT_GROUP_SCHED */ +}; +#endif + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* RT IPI pull logic requires IRQ_WORK */ +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) +# define HAVE_RT_PUSH_IPI +#endif + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; + +#endif /* CONFIG_SMP */ + int rt_queued; + + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct task_group *tg; +#endif +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root_cached root; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision whether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root_cached pushable_dl_tasks_root; +#else + struct dl_bw dl_bw; +#endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) +#else +#define entity_is_task(se) 1 +#endif + +#ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. 
+ */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + +struct perf_domain { + struct em_perf_domain *em_pd; + struct perf_domain *next; + struct rcu_head rcu; +}; + +/* Scheduling group status flags */ +#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member CPUs from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ + int overload; +#ifdef CONFIG_BT_SCHED + int overload_bt; +#endif + + /* Indicate one or more cpus over-utilized (tipping point) */ + int overutilized; + + /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; + + unsigned long max_cpu_capacity; + + /* + * NULL-terminated list of performance domains intersecting with the + * CPUs of the rd. Protected by RCU. + */ + struct perf_domain __rcu *pd; + + KABI_RESERVE(1); + KABI_RESERVE(2); + KABI_RESERVE(3); + KABI_RESERVE(4); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +extern void init_defrootdomain(void); +extern int sched_init_domains(const struct cpumask *cpu_map); +extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); + +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_UCLAMP_TASK +/* + * struct uclamp_bucket - Utilization clamp bucket + * @value: utilization clamp value for tasks on this clamp bucket + * @tasks: number of RUNNABLE tasks on this clamp bucket + * + * Keep track of how many tasks are RUNNABLE for a given utilization + * clamp value. + */ +struct uclamp_bucket { + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); +}; + +/* + * struct uclamp_rq - rq's utilization clamp + * @value: currently active clamp values for a rq + * @bucket: utilization clamp buckets affecting a rq + * + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. 
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE + * (or actually running) with that value. + * + * There are up to UCLAMP_CNT possible different clamp values, currently there + * are only two: minimum utilization and maximum utilization. + * + * All utilization clamping values are MAX aggregated, since: + * - for util_min: we want to run the CPU at least at the max of the minimum + * utilization required by its currently RUNNABLE tasks. + * - for util_max: we want to allow the CPU to run up to the max of the + * maximum utilization allowed by its currently RUNNABLE tasks. + * + * Since on each system we expect only a limited number of different + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track + * the metrics required to compute all the per-rq utilization clamp values. + */ +struct uclamp_rq { + unsigned int value; + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; +}; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); +#endif /* CONFIG_UCLAMP_TASK */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t __lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned int nr_running; +#ifdef CONFIG_BT_SCHED + unsigned int bt_nr_running; + u64 bt_blocked_clock; +#endif +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; +#endif +#ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +#ifdef CONFIG_HT_ISOLATE + int core_curr_stat; + int ht_sensi_type; +#endif +#endif /* CONFIG_SMP */ + unsigned int nohz_tick_stopped; + atomic_t nohz_flags; +#endif /* CONFIG_NO_HZ_COMMON */ + + unsigned long nr_load_updates; + u64 nr_switches; + +#ifdef CONFIG_BT_SCHED + struct load_weight bt_load; + unsigned long nr_bt_load_updates; +#endif + +#ifdef CONFIG_UCLAMP_TASK + /* Utilization clamp values based on CPU's RUNNABLE tasks */ + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; + unsigned int uclamp_flags; +#define UCLAMP_FLAG_IDLE 0x01 +#endif + + struct cfs_rq cfs; +#ifdef CONFIG_BT_SCHED + struct bt_rq bt; +#endif + struct rt_rq rt; + struct dl_rq dl; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_BT_GROUP_SCHED + struct list_head leaf_bt_rq_list; +#endif /* CONFIG_BT_GROUP_SCHED */ + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. 
Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; +#ifdef CONFIG_BT_SCHED + unsigned long last_balance_bt; +#endif + struct mm_struct *prev_mm; + + unsigned int clock_update_flags; + u64 clock; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; + u64 clock_pelt; + unsigned long lost_idle_time; + + atomic_t nr_iowait; + +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain __rcu *sd; + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; + + struct callback_head *balance_callback; + + unsigned char idle_balance; + + unsigned long misfit_task_load; + + /* For active balancing */ + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + + /* CPU of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; +#ifdef CONFIG_BT_SCHED + struct list_head bt_tasks; +#endif + + struct sched_avg avg_rt; + struct sched_avg avg_dl; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + struct sched_avg avg_irq; +#endif + u64 idle_bt_stamp; + u64 avg_idle_bt; +#ifdef CONFIG_BT_SCHED + u64 idle_stamp; + u64 avg_idle; +#endif + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; +#ifdef CONFIG_BT_SCHED + long calc_bt_load_active; +#endif + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + call_single_data_t hrtick_csd; +#endif + struct hrtimer hrtick_timer; + ktime_t hrtick_time; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + +#ifdef CONFIG_MEM_QOS + struct list_head exp_reclaim_list; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif + +#ifdef CONFIG_SCHED_CORE + /* per rq */ + struct rq *core; + struct task_struct *core_pick; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; + + /* shared state */ + unsigned int core_task_seq; + unsigned int core_pick_seq; + unsigned long core_cookie; + unsigned char core_forceidle; + unsigned int core_forceidle_seq; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} +#endif + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +struct sched_group; +#ifdef CONFIG_SCHED_CORE +static inline struct cpumask *sched_group_span(struct sched_group *sg); + +DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); + +static inline bool sched_core_enabled(struct rq *rq) +{ + return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled; +} + +static inline bool sched_core_disabled(void) +{ + return !static_branch_unlikely(&__sched_core_enabled); +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + if (sched_core_enabled(rq)) + return &rq->core->__lock; + + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + if (rq->core_enabled) + return &rq->core->__lock; + + return &rq->__lock; +} + +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); + +/* + * Helpers to check if the CPU's core cookie matches with the task's cookie + * when core scheduling is enabled. + * A special case is that the task's cookie always matches with CPU's core + * cookie if the CPU is in an idle core. + */ +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + return rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + bool idle_core = true; + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { + if (!available_idle_cpu(cpu)) { + idle_core = false; + break; + } + } + + /* + * A CPU in an idle core is always the best choice for tasks with + * cookies. + */ + return idle_core || rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. 
*/ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { + if (sched_core_cookie_match(rq, p)) + return true; + } + return false; +} + +extern void queue_core_balance(struct rq *rq); + +static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +extern unsigned long sched_core_alloc_cookie(void); +extern void sched_core_put_cookie(unsigned long cookie); +extern unsigned long sched_core_get_cookie(unsigned long cookie); +extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie); +#else /* !CONFIG_SCHED_CORE */ + +static inline bool sched_core_enabled(struct rq *rq) +{ + return false; +} + +static inline bool sched_core_disabled(void) +{ + return true; +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + return true; +} + +static inline void queue_core_balance(struct rq *rq) +{ +} +#endif /* CONFIG_SCHED_CORE */ + +static inline void lockdep_assert_rq_held(struct rq *rq) +{ + lockdep_assert_held(__rq_lockp(rq)); +} + +extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); +extern bool raw_spin_rq_trylock(struct rq *rq); +extern void raw_spin_rq_unlock(struct rq *rq); + +static inline void raw_spin_rq_lock(struct rq *rq) +{ + raw_spin_rq_lock_nested(rq, 0); +} + +static inline void raw_spin_rq_lock_irq(struct rq *rq) +{ + local_irq_disable(); + raw_spin_rq_lock(rq); +} + +static inline void raw_spin_rq_unlock_irq(struct rq *rq) +{ + raw_spin_rq_unlock(rq); + local_irq_enable(); +} + +static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) +{ + unsigned long flags; + local_irq_save(flags); + raw_spin_rq_lock(rq); + return flags; +} + +static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags) +{ + raw_spin_rq_unlock(rq); + local_irq_restore(flags); +} + +#define raw_spin_rq_lock_irqsave(rq, flags) \ +do { \ + flags = _raw_spin_rq_lock_irqsave(rq); \ +} while (0) + +#ifdef CONFIG_SCHED_SMT +extern void __update_idle_core(struct rq *rq); + +static inline void update_idle_core(struct rq *rq) +{ + if (static_branch_unlikely(&sched_smt_present)) + __update_idle_core(rq); +} + +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + +//DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +#define cpu_rq(cpu) (per_cpu_ptr(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +//#define raw_rq() raw_cpu_ptr(&runqueues) +#define raw_rq() raw_cpu_ptr(runqueues) + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + 
+/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + +extern void update_rq_clock(struct rq *rq); + +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return READ_ONCE(rq->clock); +} + +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq-clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + +static inline u64 rq_clock(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_task; +} + +static inline void rq_clock_skip_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttling, which is the only time a skip + * request is cancelled. + */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). 
+ */ + unsigned int clock_update_flags; +#endif +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(__rq_lockp(rq), rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. + */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif +} + +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock_irqsave(rq, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock_irq(rq); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock(rq); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock_irqrestore(rq, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock_irq(rq); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +extern void sched_init_numa(void); +extern void sched_domains_numa_masks_set(unsigned int cpu); +extern void sched_domains_numa_masks_clear(unsigned int cpu); +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); +#else +static inline void sched_init_numa(void) { } +static inline void sched_domains_numa_masks_set(unsigned int cpu) { } +static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + return nr_cpu_ids; +} 
+#endif + +#ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); +extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +#else +static inline void +init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_SMP + +static inline void +queue_balance_callback(struct rq *rq, + struct callback_head *head, + void (*func)(struct rq *rq)) +{ + lockdep_assert_rq_held(rq); + + if (unlikely(head->next)) + return; + + head->func = (void (*)(struct callback_head *))func; + head->next = rq->balance_callback; + rq->balance_callback = head; +} + +extern void sched_ttwu_pending(void); + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See destroy_sched_domains: call_rcu for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) + +#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The CPU whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given CPU. + * + * Returns the highest sched_domain of a CPU which contains the given flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + for_each_domain(cpu, sd) { + if (!(sd->flags & flag)) + break; + hsd = sd; + } + + return hsd; +} + +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) + break; + } + + return sd; +} + +//DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_size); +DECLARE_PER_CPU(int, sd_llc_id); +//DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); +extern struct static_key_false sched_asym_cpucapacity; + +struct sched_group_capacity { + atomic_t ref; + /* + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity + * for a single CPU. 
+ */ + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ + +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* Balance mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* CPU of highest priority in group */ + + KABI_RESERVE(1); + KABI_RESERVE(2); + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +static inline struct cpumask *sched_group_span(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * See build_balance_mask(). + */ +static inline struct cpumask *group_balance_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgc->cpumask); +} + +/** + * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. + * @group: The group whose first CPU is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_span(group)); +} + +extern int group_balance_cpu(struct sched_group *sg); + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + +#else + +static inline void sched_ttwu_pending(void) { } + +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "autogroup.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. 
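+ * Holding either of those two locks is therefore enough to observe a stable value.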
+ */ +static inline struct task_group *task_group(struct task_struct *p) +{ + return p->sched_task_group; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) || \ + defined(CONFIG_BT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; +#endif + +#ifdef CONFIG_BT_GROUP_SCHED + p->bt.bt_rq = tg->bt_rq[cpu]; + p->bt.parent = tg->bt[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfully executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + WRITE_ONCE(p->cpu, cpu); +#else + WRITE_ONCE(task_thread_info(p)->cpu, cpu); +#endif + p->wake_cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG + +/* + * To support run-time toggling of sched features, all the translation units + * (but core.c) reference the sysctl_sched_features defined in core.c. + */ +extern const_debug unsigned int sysctl_sched_features; + +#ifdef CONFIG_JUMP_LABEL +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ +{ \ + return static_key_##enabled(key); \ +} + +#include "features.h" +#undef SCHED_FEAT + +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) + +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ + +/* + * Each translation unit has its own copy of sysctl_sched_features to allow + * constants propagation at compile time and compiler optimization based on + * features default. 
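+ * With the constant initializer below, sched_feat(x) folds to a compile-time 0/1 and dead feature branches are optimized away.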
+ */ +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | +static const_debug __maybe_unused unsigned int sysctl_sched_features = +#include "features.h" + 0; +#undef SCHED_FEAT + +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* SCHED_DEBUG */ + +extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +#ifdef CONFIG_BT_BANDWIDTH +static inline u64 global_bt_period(void) +{ + return (u64)sysctl_sched_bt_period * NSEC_PER_USEC; +} +#endif + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; +} + +/* + * wake flags + */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; +extern const int sched_latency_to_weight[40]; + +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 + +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x40 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif + +#define RETRY_TASK ((void *)-1UL) + +struct sched_class { + const struct sched_class *next; + +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + + /* + * Both @prev and @rf are optional and may be NULL, in which case the + * caller must already have invoked put_prev_task(rq, prev, rf). + * + * Otherwise it is the responsibility of the pick_next_task() to call + * put_prev_task() on the @prev task or something equivalent, IFF it + * returns a next task. + * + * In that case (@rf != NULL) it may return RETRY_TASK when it finds a + * higher prio class has runnable tasks. + */ + struct task_struct * (*pick_next_task)(struct rq *rq, + struct task_struct *prev, + struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + +#ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + struct task_struct * (*pick_task)(struct rq *rq); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + void (*task_fork)(struct task_struct *p); + void (*task_dead)(struct task_struct *p); + + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. 
+ */ + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval)(struct rq *rq, + struct task_struct *task); + + void (*update_curr)(struct rq *rq); + +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_BT_GROUP_SCHED) + void (*task_change_group)(struct task_struct *p, int type); +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); + KABI_RESERVE(3); + KABI_RESERVE(4); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + WARN_ON_ONCE(rq->curr != prev); + prev->sched_class->put_prev_task(rq, prev); +} + +static inline void set_next_task(struct rq *rq, struct task_struct *next) +{ + next->sched_class->set_next_task(rq, next, false); +} + +#ifdef CONFIG_SMP +#define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = class->next) + +#define for_each_class(class) \ + for_class_range(class, sched_class_highest, NULL) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; +#ifdef CONFIG_BT_SCHED +extern const struct sched_class bt_sched_class; +#endif + +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} + +#ifdef CONFIG_SMP + +extern void update_group_capacity(struct sched_domain *sd, int cpu); + +extern void trigger_load_balance(struct rq *rq); +#ifdef CONFIG_BT_SCHED +extern void trigger_load_balance_bt(struct rq *rq); +#endif + +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + +#if defined(CONFIG_BT_GROUP_SCHED) +extern void idle_enter_bt(struct rq *this_rq); +extern void idle_exit_bt(struct rq *this_rq); +#else +static inline void idle_enter_bt(struct rq *this_rq) {} +static inline void idle_exit_bt(struct rq *this_rq) {} +#endif + +#endif + +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + SCHED_WARN_ON(!rcu_read_lock_held()); + + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + +extern void schedule_idle(void); + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); + +extern void init_sched_dl_class(void); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); +#ifdef CONFIG_BT_SCHED +extern void init_sched_bt_class(void); +extern void update_idle_cpu_bt_load(struct rq *this_rq); +extern 
void init_bt_entity_runnable_average(struct sched_bt_entity *se); +extern void post_init_bt_entity_util_avg(struct sched_bt_entity *se); +#endif + +extern void reweight_task(struct task_struct *p, int prio); + +extern void resched_curr(struct rq *rq); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +#ifdef CONFIG_BT_BANDWIDTH +extern struct bt_bandwidth def_bt_bandwidth; +extern void init_bt_bandwidth(struct bt_bandwidth *bt_b, u64 period, u64 runtime); +#endif + +#ifdef CONFIG_BT_SHARE_CFS_BANDWIDTH +extern void do_sched_bt_slack_timer(struct cfs_bandwidth *cfs_b); +extern int do_sched_bt_period_timer_share(struct cfs_bandwidth *cfs_b, unsigned long flags); +extern void __refill_cfs_bandwidth_runtime_bt(struct cfs_bandwidth *cfs_b); +extern void unthrottle_bt_rq_share(struct bt_rq *bt_rq); +extern void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b); +extern int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire); +extern bool cfs_bandwidth_used(void); +#endif + +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); + +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) +unsigned long to_ratio(u64 period, u64 runtime); + +extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p); + +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); +extern int __init sched_tick_offload_init(void); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. 
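+ * sched_update_tick_dependency() below turns the sched_can_stop_tick() answer into the TICK_DEP_BIT_SCHED dependency bit for the CPU.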
+ */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline int sched_tick_offload_init(void) { return 0; } +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + unsigned prev_nr = RQ_CFS_NR_RUNNING(rq); + + rq->nr_running += count; + +#ifdef CONFIG_SMP + if (prev_nr < 2 && RQ_CFS_NR_RUNNING(rq) >= 2) { + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); + } + +#ifdef CONFIG_BT_SCHED + if (rq->bt_nr_running >= 2) { + if (!READ_ONCE(rq->rd->overload_bt)) + WRITE_ONCE(rq->rd->overload_bt, 1); + } +#endif +#endif + + sched_update_tick_dependency(rq); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); +} + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + +#ifdef CONFIG_SMP + +static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) +{ +#ifdef CONFIG_SCHED_CORE + /* + * In order to not have {0,2},{1,3} turn into into an AB-BA, + * order by core-id first and cpu-id second. + * + * Notably: + * + * double_rq_lock(0,3); will take core-0, core-1 lock + * double_rq_lock(1,2); will take core-1, core-0 lock + * + * when only cpu-id is considered. + */ + if (rq1->core->cpu < rq2->core->cpu) + return true; + if (rq1->core->cpu > rq2->core->cpu) + return false; + + /* + * __sched_core_flip() relies on SMT having cpu-id lock order. + */ +#endif + return rq1->cpu < rq2->cpu; +} + +extern void double_rq_lock(struct rq *rq1, struct rq *rq2); +#ifdef CONFIG_PREEMPTION + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. 
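+ * This variant simply drops this_rq->lock and retakes both locks via double_rq_lock(), which acquires them in rq_order_less() order.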
+ */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower CPU-ids and will + * grant the double lock to lower CPUs over higher ids under contention, + * regardless of entry order into the function. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + if (__rq_lockp(this_rq) == __rq_lockp(busiest)) + return 0; + + if (likely(raw_spin_rq_trylock(busiest))) + return 0; + + if (rq_order_less(this_rq, busiest)) { + raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + return 0; + } + + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; +} +#endif /* CONFIG_PREEMPTION */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + lockdep_assert_irqs_disabled(); + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + if (__rq_lockp(this_rq) != __rq_lockp(busiest)) + raw_spin_rq_unlock(busiest); + lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_); +} + +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + raw_spin_lock(l1); + raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_unlock(rq2); + else + __release(rq2->lock); + raw_spin_rq_unlock(rq1); +} + +extern void set_rq_online (struct rq *rq); +extern void set_rq_offline(struct rq *rq); +extern bool sched_smp_initialized; + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_rq_lock(rq1); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
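+ * On UP both arguments must refer to the same runqueue, so only rq1's lock is actually released.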
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_rq_unlock(rq1); + __release(rq2->lock); +} + +#endif + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_SCHED_DEBUG +extern bool sched_debug_enabled; + +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); +extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); +#ifdef CONFIG_NUMA_BALANCING +extern void +show_numa_stats(struct task_struct *p, struct seq_file *m); +extern void +print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf); +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); +#ifdef CONFIG_BT_SCHED +extern void init_bt_rq(struct bt_rq *bt_rq); +#endif + +#ifdef CONFIG_SCHED_DEBUG +#ifdef CONFIG_BT_SCHED +extern void print_bt_stats(struct seq_file *m, int cpu); +extern void print_bt_rq(struct seq_file *m, int cpu, struct bt_rq *bt_rq); +#endif +#endif + +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); + +#ifdef CONFIG_NO_HZ_COMMON +#define NOHZ_BALANCE_KICK_BIT 0 +#define NOHZ_STATS_KICK_BIT 1 + +#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) + +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) + +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(struct rq *rq); +#else +static inline void nohz_balance_exit_idle(struct rq *rq) { } +#endif + + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +struct irqtime { + u64 total; + u64 tick_delta; + u64 irq_start_time; + struct u64_stats_sync sync; +}; + +DECLARE_PER_CPU(struct irqtime, cpu_irqtime); + +/* + * Returns the irqtime minus the softirq time computed by ksoftirqd. + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * and never move forward. + */ +static inline u64 irq_time_read(int cpu) +{ + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); + unsigned int seq; + u64 total; + + do { + seq = __u64_stats_fetch_begin(&irqtime->sync); + total = irqtime->total; + } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); + + return total; +} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. 
+ * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. + * + * This function is called by the scheduler on the CPU whose utilization is + * being updated. + * + * It can only be called from RCU-sched read-side critical sections. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). + * + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT tasks. + */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); + if (data) + data->func(data, rq_clock(rq), flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef CONFIG_UCLAMP_TASK +unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +/** + * uclamp_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ +static __always_inline +unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + unsigned int min_util; + unsigned int max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + if (p) { + min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); + max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); + } + + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} + +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return uclamp_util_with(rq, util, NULL); +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. 
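+ * Checking the static key first keeps the common no-uclamp case free of the rq->uclamp[] reads done in uclamp_util_with() above.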
+ */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + return util; +} +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return util; +} + +static inline bool uclamp_is_used(void) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK */ + +#ifdef arch_scale_freq_capacity +# ifndef arch_scale_freq_invariant +# define arch_scale_freq_invariant() true +# endif +#else +# define arch_scale_freq_invariant() false +#endif + +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif + +/** + * enum schedutil_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency + * @ENERGY_UTIL: Utilization used during energy calculation + * + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time + * need to be aggregated differently depending on the usage made of them. This + * enum is used within schedutil_freq_util() to differentiate the types of + * utilization expected by the callers, and adjust the aggregation accordingly. + */ +enum schedutil_type { + FREQUENCY_UTIL, + ENERGY_UTIL, +}; + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p); + +static inline unsigned long cpu_bw_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + + return util; +} + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) +{ + return 0; +} +#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} +#endif + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + +#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ + return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ + +#define perf_domain_span(pd) NULL +static inline bool sched_energy_enabled(void) { return false; } + +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. + */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif diff --git a/ops/os_stat/os_stat/include_private/kernel/sched/stats.h b/ops/os_stat/os_stat/include_private/kernel/sched/stats.h new file mode 100644 index 0000000000000000000000000000000000000000..f48e8cb04b036cef4a4a7b3be00a23cda61bc1c6 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/kernel/sched/stats.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +//#include + +#ifdef CONFIG_SCHEDSTATS + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcount++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_cpu_time += delta; +} + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} + +static inline void update_schedstat_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var) do { var++; } while (0) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define __schedstat_add(var, amt) do { var += (amt); } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) +#define schedstat_update_avg(var, val) do { update_schedstat_avg(var, val); } while (0) + +#else /* !CONFIG_SCHEDSTATS: */ +static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } +# define schedstat_enabled() 0 +# define __schedstat_inc(var) do { } while (0) +# define schedstat_inc(var) do { } while (0) +# define __schedstat_add(var, amt) do { } while (0) +# define schedstat_add(var, amt) do { } while (0) +# define __schedstat_set(var, val) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +# define schedstat_val(var) 0 +# define schedstat_val_or_zero(var) 0 +# define schedstat_update_avg(var, val) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_PSI_1 +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (static_branch_likely(&psi_disabled)) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->in_memstall) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (static_branch_likely(&psi_disabled)) + return; + + if (!sleep) { + if (p->in_memstall) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (static_branch_likely(&psi_disabled)) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || p->in_memstall)) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->in_memstall) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (static_branch_likely(&psi_disabled)) + return; + + if (unlikely(rq->curr->in_memstall)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + +#ifdef CONFIG_SCHED_INFO +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * We are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a CPU, we call this routine + * from dequeue_task() to account for possible rq->clock skew across CPUs. The + * delta taken on each CPU would annul the skew. 
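+ * The delta is accumulated in sched_info.run_delay, i.e. total time spent runnable but waiting, in rq_clock() nanoseconds.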
+ */ +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) +{ + unsigned long long now = rq_clock(rq), delta = 0; + + if (sched_info_on()) { + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + } + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(rq, delta); +} + +/* + * Called when a task finally hits the CPU. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct rq *rq, struct task_struct *t, struct task_struct *prev) +{ + unsigned long long now = rq_clock(rq), delta = 0; + + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcount++; + rq_sched_info_arrive(rq, delta); +#ifdef CONFIG_CGROUP_SLI + sli_schedlat_rundelay(t, prev, delta); +#endif +} + +/* + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct rq *rq, struct task_struct *t) +{ + if (sched_info_on()) { + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); + } +} + +/* + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. + */ +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) +{ + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; + + rq_sched_info_depart(rq, delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(rq, t); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + /* + * prev now departs the CPU. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
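+ * The same holds for the incoming side: arrival stats are skipped when next is the idle task.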
+ */ + if (prev != rq->idle) + sched_info_depart(rq, prev); + + if (next != rq->idle) + sched_info_arrive(rq, next, prev); +} + +static inline void +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + if (sched_info_on()) + __sched_info_switch(rq, prev, next); +} + +#else /* !CONFIG_SCHED_INFO: */ +# define sched_info_queued(rq, t) do { } while (0) +# define sched_info_reset_dequeued(t) do { } while (0) +# define sched_info_dequeued(rq, t) do { } while (0) +# define sched_info_depart(rq, t) do { } while (0) +# define sched_info_arrive(rq, t, prev) do { } while (0) +# define sched_info_switch(rq, t, next) do { } while (0) +#endif /* CONFIG_SCHED_INFO */ diff --git a/ops/os_stat/os_stat/include_private/mm/mm_gt_0011/slab.h b/ops/os_stat/os_stat/include_private/mm/mm_gt_0011/slab.h new file mode 100644 index 0000000000000000000000000000000000000000..a6660d06d43f50c42723f57cfe574ffd02ae44d5 --- /dev/null +++ b/ops/os_stat/os_stat/include_private/mm/mm_gt_0011/slab.h @@ -0,0 +1,632 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. + */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + slab_flags_t flags; /* Active flags on the slab */ + unsigned int useroffset;/* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#endif /* CONFIG_SLOB */ + +#ifdef CONFIG_SLAB +#include +#endif + +#ifdef CONFIG_SLUB +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. 
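+ * slab_state only ever moves forward during boot, from DOWN towards FULL.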
+ */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name[NR_KMALLOC_TYPES]; + unsigned int size; +} kmalloc_info[]; + +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); +void create_kmalloc_caches(slab_flags_t); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); +#endif + + +/* Functions provided by the slab allocators */ +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); + +struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize); +extern void create_boot_cache(struct kmem_cache *, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize); + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(unsigned size, unsigned align, + slab_flags_t flags, const char *name, void (*ctor)(void *)); +#ifndef CONFIG_SLOB +struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)); + +slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name, + void (*ctor)(void *)); +#else +static inline struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ return NULL; } + +static inline slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} +#endif + + +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_ACCOUNT) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT) +#else +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) +#endif + +/* Common flags available with current configuration */ +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ 
+ SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_ACCOUNT) + +bool __kmem_cache_empty(struct kmem_cache *); +int __kmem_cache_shutdown(struct kmem_cache *); +void __kmem_cache_release(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); +void slab_kmem_cache_release(struct kmem_cache *); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the object listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + +static inline int cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; +} + +#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SLUB_DEBUG_ON +DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); +#else +DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); +#endif +extern void print_tracking(struct kmem_cache *s, void *object); +#else +static inline void print_tracking(struct kmem_cache *s, void *object) +{ +} +#endif + +/* + * Returns true if any of the specified slub_debug flags is enabled for the + * cache. Use only for flags parsed by setup_slub_debug() as it also enables + * the static key. + */ +static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) +{ +#ifdef CONFIG_SLUB_DEBUG + VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); + if (static_branch_unlikely(&slub_debug_enabled)) + return s->flags & flags; +#endif + return false; +} + +#ifdef CONFIG_MEMCG_KMEM +static inline struct obj_cgroup **page_obj_cgroups(struct page *page) +{ + /* + * page->mem_cgroup and page->obj_cgroups are sharing the same + * space. To distinguish between them in case we don't know for sure + * that the page is a slab page (e.g. page_cgroup_ino()), let's + * always set the lowest bit of obj_cgroups. + */ + return (struct obj_cgroup **) + ((unsigned long)page->obj_cgroups & ~0x1UL); +} + +static inline bool page_has_obj_cgroups(struct page *page) +{ + return ((unsigned long)page->obj_cgroups & 0x1UL); +} + +int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, + gfp_t gfp); +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr); + +static inline void memcg_free_page_obj_cgroups(struct page *page) +{ + kfree(page_obj_cgroups(page)); + page->obj_cgroups = NULL; +} + +static inline size_t obj_full_size(struct kmem_cache *s) +{ + /* + * For each accounted object there is an extra space which is used + * to store obj_cgroup membership. Charge it too. + */ + return s->size + sizeof(struct obj_cgroup *); +} + +/* + * Returns false if the allocation should fail. 
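+ * When a charge is made, *objcgp carries a reference on the obj_cgroup; memcg_slab_post_alloc_hook() drops it again.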
+ */ +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + struct obj_cgroup *objcg; + + if (memcg_kmem_bypass()) + return true; + + if (!memcg_kmem_enabled()) + return true; + + if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) + return true; + + objcg = get_obj_cgroup_from_current(); + if (!objcg) + return true; + + if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) { + obj_cgroup_put(objcg); + return false; + } + + *objcgp = objcg; + return true; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ + struct page *page; + unsigned long off; + size_t i; + + if (!memcg_kmem_enabled() || !objcg) + return; + + for (i = 0; i < size; i++) { + if (likely(p[i])) { + page = virt_to_head_page(p[i]); + + if (!page_has_obj_cgroups(page) && + memcg_alloc_page_obj_cgroups(page, s, flags)) { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + continue; + } + + off = obj_to_index(s, page, p[i]); + obj_cgroup_get(objcg); + page_obj_cgroups(page)[off] = objcg; + mod_objcg_state(objcg, page_pgdat(page), + cache_vmstat_idx(s), obj_full_size(s)); + } else { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + } + } + obj_cgroup_put(objcg); +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, + void **p, int objects) +{ + struct kmem_cache *s; + struct obj_cgroup *objcg; + struct page *page; + unsigned int off; + int i; + + if (!memcg_kmem_enabled()) + return; + + for (i = 0; i < objects; i++) { + if (unlikely(!p[i])) + continue; + + page = virt_to_head_page(p[i]); + if (!page_has_obj_cgroups(page)) + continue; + + if (!s_orig) + s = page->slab_cache; + else + s = s_orig; + + off = obj_to_index(s, page, p[i]); + objcg = page_obj_cgroups(page)[off]; + if (!objcg) + continue; + + page_obj_cgroups(page)[off] = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } +} + +#else /* CONFIG_MEMCG_KMEM */ +static inline bool page_has_obj_cgroups(struct page *page) +{ + return false; +} + +static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) +{ + return NULL; +} + +static inline int memcg_alloc_page_obj_cgroups(struct page *page, + struct kmem_cache *s, gfp_t gfp) +{ + return 0; +} + +static inline void memcg_free_page_obj_cgroups(struct page *page) +{ +} + +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + return true; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, + void **p, int objects) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page; + + page = virt_to_head_page(obj); + if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + __func__)) + return NULL; + return page->slab_cache; +} + +static __always_inline void charge_slab_page(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) +{ + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + PAGE_SIZE << order); +} + +static __always_inline void uncharge_slab_page(struct page *page, int order, + struct kmem_cache *s) +{ + if (memcg_kmem_enabled()) + memcg_free_page_obj_cgroups(page); 
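+	/* Undo the per-node vmstat accounting done by charge_slab_page(). */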
+ + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(PAGE_SIZE << order)); +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && cachep != s, + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + struct obj_cgroup **objcgp, + size_t size, gfp_t flags) +{ + flags &= gfp_allowed_mask; + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s, flags)) + return NULL; + + if (!memcg_slab_pre_alloc_hook(s, objcgp, size, flags)) + return NULL; + + return s; +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], flags); + /* As p[i] might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, flags); + } + + memcg_slab_post_alloc_hook(s, objcg, flags, size, p); +} + +#ifndef CONFIG_SLOB +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct alien_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. 
The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + +#endif + +void *slab_start(struct seq_file *m, loff_t *pos); +void *slab_next(struct seq_file *m, void *p, loff_t *pos); +void slab_stop(struct seq_file *m, void *p); +int memcg_slab_show(struct seq_file *m, void *p); + +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +void dump_unreclaimable_slab(void); +#else +static inline void dump_unreclaimable_slab(void) +{ +} +#endif + +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp); +void cache_random_seq_destroy(struct kmem_cache *cachep); +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, + unsigned int count, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_alloc)) { + if (c->ctor) + return false; + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; + } + return flags & __GFP_ZERO; +} + +static inline bool slab_want_init_on_free(struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_free)) + return !(c->ctor || + (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + return false; +} + +#endif /* MM_SLAB_H */ diff --git a/ops/os_stat/os_stat/include_private/mm/mm_le_0011/slab.h b/ops/os_stat/os_stat/include_private/mm/mm_le_0011/slab.h new file mode 100644 index 0000000000000000000000000000000000000000..b2b01694dc43f63259fa3ee1e1b59ef3505f794b --- /dev/null +++ b/ops/os_stat/os_stat/include_private/mm/mm_le_0011/slab.h @@ -0,0 +1,694 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. + */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + slab_flags_t flags; /* Active flags on the slab */ + unsigned int useroffset;/* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#else /* !CONFIG_SLOB */ + +struct memcg_cache_array { + struct rcu_head rcu; + struct kmem_cache *entries[0]; +}; + +/* + * This is the main placeholder for memcg-related information in kmem caches. + * Both the root cache and the child caches will have it. 
For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. To allow the + * array to be accessed without taking any locks, on relocation we free the old + * version only after a grace period. + * + * Root and child caches hold different metadata. + * + * @root_cache: Common to root and child caches. NULL for root, pointer to + * the root cache for children. + * + * The following fields are specific to root caches. + * + * @memcg_caches: kmemcg ID indexed table of child caches. This table is + * used to index child cachces during allocation and cleared + * early during shutdown. + * + * @root_caches_node: List node for slab_root_caches list. + * + * @children: List of all child caches. While the child caches are also + * reachable through @memcg_caches, a child cache remains on + * this list until it is actually destroyed. + * + * The following fields are specific to child caches. + * + * @memcg: Pointer to the memcg this cache belongs to. + * + * @children_node: List node for @root_cache->children list. + * + * @kmem_caches_node: List node for @memcg->kmem_caches list. + */ +struct memcg_cache_params { + struct kmem_cache *root_cache; + union { + struct { + struct memcg_cache_array __rcu *memcg_caches; + struct list_head __root_caches_node; + struct list_head children; + bool dying; + }; + struct { + struct mem_cgroup *memcg; + struct list_head children_node; + struct list_head kmem_caches_node; + struct percpu_ref refcnt; + + void (*work_fn)(struct kmem_cache *); + union { + struct rcu_head rcu_head; + struct work_struct work; + }; + }; + }; +}; +#endif /* CONFIG_SLOB */ + +#ifdef CONFIG_SLAB +#include +#endif + +#ifdef CONFIG_SLUB +#include +#endif + +#include +#include +#include +#include +#include +#include + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. 
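+ * For example, the "kmem_cache" cache that backs struct kmem_cache itself has
+ * to be set up from a statically allocated instance early during boot, before
+ * dynamic allocation is available.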
+ */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name; + unsigned int size; +} kmalloc_info[]; + +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); +void create_kmalloc_caches(slab_flags_t); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); +#endif + + +/* Functions provided by the slab allocators */ +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); + +struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize); +extern void create_boot_cache(struct kmem_cache *, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize); + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(unsigned size, unsigned align, + slab_flags_t flags, const char *name, void (*ctor)(void *)); +#ifndef CONFIG_SLOB +struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)); + +slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name, + void (*ctor)(void *)); +#else +static inline struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ return NULL; } + +static inline slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} +#endif + + +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_ACCOUNT) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +/* Common flags available with current configuration */ +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + 
SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_ACCOUNT) + +bool __kmem_cache_empty(struct kmem_cache *); +int __kmem_cache_shutdown(struct kmem_cache *); +void __kmem_cache_release(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); +void __kmemcg_cache_deactivate(struct kmem_cache *s); +void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); +void slab_kmem_cache_release(struct kmem_cache *); +void kmem_cache_shrink_all(struct kmem_cache *s); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the object listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + +static inline int cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE; +} + +#ifdef CONFIG_MEMCG_KMEM + +/* List of all root caches. */ +extern struct list_head slab_root_caches; +#define root_caches_node memcg_params.__root_caches_node + +/* + * Iterate over all memcg caches of the given root cache. The caller must hold + * slab_mutex. + */ +#define for_each_memcg_cache(iter, root) \ + list_for_each_entry(iter, &(root)->memcg_params.children, \ + memcg_params.children_node) + +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params.root_cache; +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return p == s || p == s->memcg_params.root_cache; +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + s = s->memcg_params.root_cache; + return s->name; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params.root_cache; +} + +/* + * Expects a pointer to a slab page. Please note, that PageSlab() check + * isn't sufficient, as it returns true also for tail compound slab pages, + * which do not have slab_cache pointer set. + * So this function assumes that the page can pass PageSlab() && !PageTail() + * check. + * + * The kmem_cache can be reparented asynchronously. The caller must ensure + * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex. + */ +static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) +{ + struct kmem_cache *s; + + s = READ_ONCE(page->slab_cache); + if (s && !is_root_cache(s)) + return READ_ONCE(s->memcg_params.memcg); + + return NULL; +} + +/* + * Charge the slab page belonging to the non-root kmem_cache. + * Can be called for non-root kmem_caches only. 
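+ * Besides the memcg charge itself, a reference per page is taken on the
+ * cache's percpu refcount and dropped again in memcg_uncharge_slab().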
+ */ +static __always_inline int memcg_charge_slab(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + int ret; + + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + while (memcg && !css_tryget_online(&memcg->css)) + memcg = parent_mem_cgroup(memcg); + rcu_read_unlock(); + + if (unlikely(!memcg || mem_cgroup_is_root(memcg))) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + (1 << order)); + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + return 0; + } + + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (ret) + goto out; + + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); + + /* transer try_charge() page references to kmem_cache */ + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + css_put_many(&memcg->css, 1 << order); +out: + css_put(&memcg->css); + return ret; +} + +/* + * Uncharge a slab page belonging to a non-root kmem_cache. + * Can be called for non-root kmem_caches only. + */ +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + if (likely(!mem_cgroup_is_root(memcg))) { + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); + memcg_kmem_uncharge_memcg(page, order, memcg); + } else { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + } + rcu_read_unlock(); + + percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); +} + +extern void slab_init_memcg_params(struct kmem_cache *); +extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); + +#else /* CONFIG_MEMCG_KMEM */ + +/* If !memcg, all caches are root. 
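+ * All of the memcg-specific helpers below then degenerate to trivial stubs.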
*/ +#define slab_root_caches slab_caches +#define root_caches_node list + +#define for_each_memcg_cache(iter, root) \ + for ((void)(iter), (void)(root); 0; ) + +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return s == p; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} + +static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) +{ + return NULL; +} + +static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, + struct kmem_cache *s) +{ + return 0; +} + +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + +static inline void slab_init_memcg_params(struct kmem_cache *s) +{ +} + +static inline void memcg_link_cache(struct kmem_cache *s, + struct mem_cgroup *memcg) +{ +} + +#endif /* CONFIG_MEMCG_KMEM */ + +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page; + + page = virt_to_head_page(obj); + if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + __func__)) + return NULL; + return page->slab_cache; +} + +static __always_inline int charge_slab_page(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) +{ + if (is_root_cache(s)) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + 1 << order); + return 0; + } + + return memcg_charge_slab(page, gfp, order, s); +} + +static __always_inline void uncharge_slab_page(struct page *page, int order, + struct kmem_cache *s) +{ + if (is_root_cache(s)) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + return; + } + + memcg_uncharge_slab(page, order, s); +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && + !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); + return cachep; +} + +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. 
+ */ + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s, flags)) + return NULL; + + if (memcg_kmem_enabled() && + ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT))) + return memcg_kmem_get_cache(s); + + return s; +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], flags); + /* As p[i] might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, flags); + } + + if (memcg_kmem_enabled()) + memcg_kmem_put_cache(s); +} + +#ifndef CONFIG_SLOB +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct alien_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. 
The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + +#endif + +void *slab_start(struct seq_file *m, loff_t *pos); +void *slab_next(struct seq_file *m, void *p, loff_t *pos); +void slab_stop(struct seq_file *m, void *p); +void *memcg_slab_start(struct seq_file *m, loff_t *pos); +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); +void memcg_slab_stop(struct seq_file *m, void *p); +int memcg_slab_show(struct seq_file *m, void *p); + +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +void dump_unreclaimable_slab(void); +#else +static inline void dump_unreclaimable_slab(void) +{ +} +#endif + +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp); +void cache_random_seq_destroy(struct kmem_cache *cachep); +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, + unsigned int count, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_alloc)) { + if (c->ctor) + return false; + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; + } + return flags & __GFP_ZERO; +} + +static inline bool slab_want_init_on_free(struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_free)) + return !(c->ctor || + (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + return false; +} + +#endif /* MM_SLAB_H */ diff --git a/ops/os_stat/os_stat/include_pub/arch/x86/include/asm/syscall.h b/ops/os_stat/os_stat/include_pub/arch/x86/include/asm/syscall.h new file mode 100644 index 0000000000000000000000000000000000000000..8db3fdb6102ecb373f085de3a6033c645fbe644c --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/arch/x86/include/asm/syscall.h @@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Access to user system call parameters and results + * + * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. + * + * See asm-generic/syscall.h for descriptions of what we must do here. + */ + +#ifndef _ASM_X86_SYSCALL_H +#define _ASM_X86_SYSCALL_H + +#include +#include +#include +#include /* For NR_syscalls */ +#include /* for TS_COMPAT */ +#include + +#ifdef CONFIG_X86_64 +typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *); +#else +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); +#endif /* CONFIG_X86_64 */ +extern const sys_call_ptr_t sys_call_table[]; + +#if defined(CONFIG_X86_32) +#define ia32_sys_call_table sys_call_table +#define __NR_syscall_compat_max __NR_syscall_max +#define IA32_NR_syscalls NR_syscalls +#endif + +#if defined(CONFIG_IA32_EMULATION) +extern const sys_call_ptr_t ia32_sys_call_table[]; +#endif + +#ifdef CONFIG_X86_X32_ABI +extern const sys_call_ptr_t x32_sys_call_table[]; +#endif + +/* + * Only the low 32 bits of orig_ax are meaningful, so we return int. + * This importantly ignores the high bits on 64-bit, so comparisons + * sign-extend the low 32 bits. 
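+ * In particular an orig_ax of -1, the value used to mark a skipped or
+ * rejected syscall, still reads back as -1 from syscall_get_nr().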
+ */ +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) +{ + return regs->orig_ax; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + regs->ax = regs->orig_ax; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + unsigned long error = regs->ax; +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entries and then + * remains set until we return to user mode. + */ + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long) (int) error; +#endif + return IS_ERR_VALUE(error) ? error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->ax; +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->ax = (long) error ?: val; +} + +#ifdef CONFIG_X86_32 + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + memcpy(args, ®s->bx, 6 * sizeof(args[0])); +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ + BUG_ON(i + n > 6); + memcpy(®s->bx + i, args, n * sizeof(args[0])); +} + +static inline int syscall_get_arch(struct task_struct *task) +{ + return AUDIT_ARCH_I386; +} + +#else /* CONFIG_X86_64 */ + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + *args++ = regs->bx; + *args++ = regs->cx; + *args++ = regs->dx; + *args++ = regs->si; + *args++ = regs->di; + *args = regs->bp; + } else +# endif + { + *args++ = regs->di; + *args++ = regs->si; + *args++ = regs->dx; + *args++ = regs->r10; + *args++ = regs->r8; + *args = regs->r9; + } +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + regs->bx = *args++; + regs->cx = *args++; + regs->dx = *args++; + regs->si = *args++; + regs->di = *args++; + regs->bp = *args; + } else +# endif + { + regs->di = *args++; + regs->si = *args++; + regs->dx = *args++; + regs->r10 = *args++; + regs->r8 = *args++; + regs->r9 = *args; + } +} + +static inline int syscall_get_arch(struct task_struct *task) +{ + /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ + return (IS_ENABLED(CONFIG_IA32_EMULATION) && + task->thread_info.status & TS_COMPAT) + ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; +} +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SYSCALL_H */ diff --git a/ops/os_stat/os_stat/include_pub/drivers/block/loop.h b/ops/os_stat/os_stat/include_pub/drivers/block/loop.h new file mode 100644 index 0000000000000000000000000000000000000000..ef4ec2b55bc8638daa370facabf7114b54027b47 --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/drivers/block/loop.h @@ -0,0 +1,94 @@ +/* + * loop.h + * + * Written by Theodore Ts'o, 3/29/93. + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. 
+ */ +#ifndef _LINUX_LOOP_H +#define _LINUX_LOOP_H + +#include +#include +#include +#include +#include +#include +#include + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, +}; + +struct loop_func_table; + +struct loop_device { + int lo_number; + atomic_t lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + int (*transfer)(struct loop_device *, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + char lo_file_name[LO_NAME_SIZE]; + char lo_crypt_name[LO_NAME_SIZE]; + char lo_encrypt_key[LO_KEY_SIZE]; + int lo_encrypt_key_size; + struct loop_func_table *lo_encryption; + __u32 lo_init[2]; + kuid_t lo_key_owner; /* Who set the key */ + int (*ioctl)(struct loop_device *, int cmd, + unsigned long arg); + + struct file * lo_backing_file; + struct block_device *lo_device; + void *key_data; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + struct kthread_worker worker; + struct task_struct *worker_task; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; +}; + +struct loop_cmd { + struct kthread_work work; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *css; +}; + +/* Support for loadable transfer modules */ +struct loop_func_table { + int number; /* filter type */ + int (*transfer)(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + int (*init)(struct loop_device *, const struct loop_info64 *); + /* release is called from loop_unregister_transfer or clr_fd */ + int (*release)(struct loop_device *); + int (*ioctl)(struct loop_device *, int cmd, unsigned long arg); + struct module *owner; +}; + +int loop_register_transfer(struct loop_func_table *funcs); +int loop_unregister_transfer(int number); + +#endif diff --git a/ops/os_stat/os_stat/include_pub/drivers/target/target_core_file.h b/ops/os_stat/os_stat/include_pub/drivers/target/target_core_file.h new file mode 100644 index 0000000000000000000000000000000000000000..929b1ecd544ee0ffb84973b64867a3dabb8a2f45 --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/drivers/target/target_core_file.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef TARGET_CORE_FILE_H +#define TARGET_CORE_FILE_H + +#include + +#define FD_VERSION "4.0" + +#define FD_MAX_DEV_NAME 256 +#define FD_MAX_DEV_PROT_NAME FD_MAX_DEV_NAME + 16 +#define FD_DEVICE_QUEUE_DEPTH 32 +#define FD_MAX_DEVICE_QUEUE_DEPTH 128 +#define FD_BLOCKSIZE 512 +/* + * Limited by the number of iovecs (2048) per vfs_[writev,readv] call + */ +#define FD_MAX_BYTES 8388608 + +#define RRF_EMULATE_CDB 0x01 +#define RRF_GOT_LBA 0x02 + +#define FBDF_HAS_PATH 0x01 +#define FBDF_HAS_SIZE 0x02 +#define FDBD_HAS_BUFFERED_IO_WCE 0x04 +#define FDBD_HAS_ASYNC_IO 0x08 +#define FDBD_FORMAT_UNIT_SIZE 2048 + +struct fd_dev { + struct se_device dev; + + u32 fbd_flags; + unsigned char fd_dev_name[FD_MAX_DEV_NAME]; + /* Unique Ramdisk Device ID in Ramdisk HBA */ + u32 fd_dev_id; + /* Number of SG tables in sg_table_array */ + u32 fd_table_count; + u32 fd_queue_depth; + u32 fd_block_size; + unsigned long long fd_dev_size; + struct file *fd_file; + struct file *fd_prot_file; + /* FILEIO HBA device is connected to */ + struct fd_host *fd_host; +} 
____cacheline_aligned; + +struct fd_host { + u32 fd_host_dev_id_count; + /* Unique FILEIO Host ID */ + u32 fd_host_id; +} ____cacheline_aligned; + +#endif /* TARGET_CORE_FILE_H */ diff --git a/ops/os_stat/os_stat/include_pub/include/generated/asm-offsets.h b/ops/os_stat/os_stat/include_pub/include/generated/asm-offsets.h new file mode 100644 index 0000000000000000000000000000000000000000..acdc417f455c775e379bdf47a2d9656aaf1e0afe --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/include/generated/asm-offsets.h @@ -0,0 +1,87 @@ +#ifndef __ASM_OFFSETS_H__ +#define __ASM_OFFSETS_H__ +/* + * DO NOT MODIFY. + * + * This file was generated by Kbuild + */ + + +#define KVM_STEAL_TIME_preempted 16 /* offsetof(struct kvm_steal_time, preempted) */ + +#define pt_regs_bx 40 /* offsetof(struct pt_regs, bx) */ +#define pt_regs_cx 88 /* offsetof(struct pt_regs, cx) */ +#define pt_regs_dx 96 /* offsetof(struct pt_regs, dx) */ +#define pt_regs_sp 152 /* offsetof(struct pt_regs, sp) */ +#define pt_regs_bp 32 /* offsetof(struct pt_regs, bp) */ +#define pt_regs_si 104 /* offsetof(struct pt_regs, si) */ +#define pt_regs_di 112 /* offsetof(struct pt_regs, di) */ +#define pt_regs_r8 72 /* offsetof(struct pt_regs, r8) */ +#define pt_regs_r9 64 /* offsetof(struct pt_regs, r9) */ +#define pt_regs_r10 56 /* offsetof(struct pt_regs, r10) */ +#define pt_regs_r11 48 /* offsetof(struct pt_regs, r11) */ +#define pt_regs_r12 24 /* offsetof(struct pt_regs, r12) */ +#define pt_regs_r13 16 /* offsetof(struct pt_regs, r13) */ +#define pt_regs_r14 8 /* offsetof(struct pt_regs, r14) */ +#define pt_regs_r15 0 /* offsetof(struct pt_regs, r15) */ +#define pt_regs_flags 144 /* offsetof(struct pt_regs, flags) */ + +#define saved_context_cr0 200 /* offsetof(struct saved_context, cr0) */ +#define saved_context_cr2 208 /* offsetof(struct saved_context, cr2) */ +#define saved_context_cr3 216 /* offsetof(struct saved_context, cr3) */ +#define saved_context_cr4 224 /* offsetof(struct saved_context, cr4) */ +#define saved_context_gdt_desc 266 /* offsetof(struct saved_context, gdt_desc) */ + +#define TSS_ist 36 /* offsetof(struct tss_struct, x86_tss.ist) */ +#define DB_STACK_OFFSET 12288 /* offsetof(struct cea_exception_stacks, DB_stack) - offsetof(struct cea_exception_stacks, DB1_stack) */ + +#define stack_canary_offset 40 /* offsetof(struct fixed_percpu_data, stack_canary) */ + +#define __NR_syscall_max 435 /* sizeof(syscalls_64) - 1 */ +#define NR_syscalls 436 /* sizeof(syscalls_64) */ +#define __NR_syscall_compat_max 435 /* sizeof(syscalls_ia32) - 1 */ +#define IA32_NR_syscalls 436 /* sizeof(syscalls_ia32) */ + +#define TASK_threadsp 9368 /* offsetof(struct task_struct, thread.sp) */ +#define TASK_stack_canary 2680 /* offsetof(struct task_struct, stack_canary) */ + +#define TASK_addr_limit 9496 /* offsetof(struct task_struct, thread.addr_limit) */ + +#define crypto_tfm_ctx_offset 64 /* offsetof(struct crypto_tfm, __crt_ctx) */ + +#define pbe_address 0 /* offsetof(struct pbe, address) */ +#define pbe_orig_address 8 /* offsetof(struct pbe, orig_address) */ +#define pbe_next 16 /* offsetof(struct pbe, next) */ + +#define IA32_SIGCONTEXT_ax 44 /* offsetof(struct sigcontext_32, ax) */ +#define IA32_SIGCONTEXT_bx 32 /* offsetof(struct sigcontext_32, bx) */ +#define IA32_SIGCONTEXT_cx 40 /* offsetof(struct sigcontext_32, cx) */ +#define IA32_SIGCONTEXT_dx 36 /* offsetof(struct sigcontext_32, dx) */ +#define IA32_SIGCONTEXT_si 20 /* offsetof(struct sigcontext_32, si) */ +#define IA32_SIGCONTEXT_di 16 /* offsetof(struct sigcontext_32, di) */ 
+#define IA32_SIGCONTEXT_bp 24 /* offsetof(struct sigcontext_32, bp) */ +#define IA32_SIGCONTEXT_sp 28 /* offsetof(struct sigcontext_32, sp) */ +#define IA32_SIGCONTEXT_ip 56 /* offsetof(struct sigcontext_32, ip) */ + +#define IA32_RT_SIGFRAME_sigcontext 164 /* offsetof(struct rt_sigframe_ia32, uc.uc_mcontext) */ + +#define BP_scratch 484 /* offsetof(struct boot_params, scratch) */ +#define BP_secure_boot 492 /* offsetof(struct boot_params, secure_boot) */ +#define BP_loadflags 529 /* offsetof(struct boot_params, hdr.loadflags) */ +#define BP_hardware_subarch 572 /* offsetof(struct boot_params, hdr.hardware_subarch) */ +#define BP_version 518 /* offsetof(struct boot_params, hdr.version) */ +#define BP_kernel_alignment 560 /* offsetof(struct boot_params, hdr.kernel_alignment) */ +#define BP_init_size 608 /* offsetof(struct boot_params, hdr.init_size) */ +#define BP_pref_address 600 /* offsetof(struct boot_params, hdr.pref_address) */ +#define BP_code32_start 532 /* offsetof(struct boot_params, hdr.code32_start) */ + +#define PTREGS_SIZE 168 /* sizeof(struct pt_regs) */ +#define TLB_STATE_user_pcid_flush_mask 22 /* offsetof(struct tlb_state, user_pcid_flush_mask) */ +#define CPU_ENTRY_AREA_entry_stack 4096 /* offsetof(struct cpu_entry_area, entry_stack_page) */ +#define SIZEOF_entry_stack 4096 /* sizeof(struct entry_stack) */ +#define MASK_entry_stack -4096 /* (~(sizeof(struct entry_stack) - 1)) */ +#define TSS_sp0 4 /* offsetof(struct tss_struct, x86_tss.sp0) */ +#define TSS_sp1 12 /* offsetof(struct tss_struct, x86_tss.sp1) */ +#define TSS_sp2 20 /* offsetof(struct tss_struct, x86_tss.sp2) */ + +#endif diff --git a/ops/os_stat/os_stat/include_pub/include/linux/nospec.h b/ops/os_stat/os_stat/include_pub/include/linux/nospec.h new file mode 100644 index 0000000000000000000000000000000000000000..0c5ef54fd4162830b55aa676c1ecae4ea6ac23f5 --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/include/linux/nospec.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Linus Torvalds. All rights reserved. +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved. +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#ifndef _LINUX_NOSPEC_H +#define _LINUX_NOSPEC_H +#include + +struct task_struct; + +/** + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * When @index is out of bounds (@index >= @size), the sign bit will be + * set. Extend the sign bit to all bits and invert, giving a result of + * zero for an out of bounds index, or ~0 if within bounds [0, @size). + */ +#ifndef array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + /* + * Always calculate and emit the mask even if the compiler + * thinks the mask is not needed. The compiler does not take + * into account the value of @index under speculation. + */ + OPTIMIZER_HIDE_VAR(index); + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); +} +#endif + +/* + * array_index_nospec - sanitize an array index after a bounds check + * + * For a code sequence like: + * + * if (index < size) { + * index = array_index_nospec(index, size); + * val = array[index]; + * } + * + * ...if the CPU speculates past the bounds check then + * array_index_nospec() will clamp the index within the range of [0, + * size). 
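+ * The architectural bounds check is still required: the macro only limits
+ * what the CPU may dereference speculatively, it does not do the check.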
+ */ +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ + \ + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ + \ + (typeof(_i)) (_i & _mask); \ +}) + +/* Speculation control prctl */ +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which); +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl); +/* Speculation control for seccomp enforced mitigation */ +void arch_seccomp_spec_mitigate(struct task_struct *task); + +#endif /* _LINUX_NOSPEC_H */ diff --git a/ops/os_stat/os_stat/include_pub/kernel/sched/autogroup.h b/ops/os_stat/os_stat/include_pub/kernel/sched/autogroup.h new file mode 100644 index 0000000000000000000000000000000000000000..b96419974a1f0e88cf3e0a7b203bde1fcff89b2c --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/kernel/sched/autogroup.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + /* + * Reference doesn't mean how many threads attach to this + * autogroup now. It just stands for the number of tasks + * which could use this autogroup. + */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/ops/os_stat/os_stat/include_pub/kernel/sched/cpudeadline.h b/ops/os_stat/os_stat/include_pub/kernel/sched/cpudeadline.h new file mode 100644 index 0000000000000000000000000000000000000000..0adeda93b5fb56e3086a3a059338ab2cc8fc58ba --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/kernel/sched/cpudeadline.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define IDX_INVALID -1 + +struct cpudl_item { + u64 dl; + int cpu; + int idx; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; +}; + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); +int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); +void cpudl_cleanup(struct cpudl *cp); +#endif /* CONFIG_SMP */ diff --git 
a/ops/os_stat/os_stat/include_pub/kernel/sched/cpupri.h b/ops/os_stat/os_stat/include_pub/kernel/sched/cpupri.h new file mode 100644 index 0000000000000000000000000000000000000000..7dc20a3232e726b3b5f91389395f49d7525120a5 --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/kernel/sched/cpupri.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + atomic_t count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int *cpu_to_pri; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#endif diff --git a/ops/os_stat/os_stat/include_pub/kernel/sched/features.h b/ops/os_stat/os_stat/include_pub/kernel/sched/features.h new file mode 100644 index 0000000000000000000000000000000000000000..2410db5e9a35302856df5db8ce1c65e8634fef80 --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/kernel/sched/features.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ +SCHED_FEAT(START_DEBIT, true) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, false) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, true) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, true) + +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) + +/* + * Decrement CPU capacity based on time not spent running tasks + */ +SCHED_FEAT(NONTASK_CAPACITY, true) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, true) + +/* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. + */ +SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) + +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. + */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. 
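+ * That IPI based behaviour is what the RT_PUSH_IPI feature below enables.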
+ */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + +SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) + +SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) + +/* + * UtilEstimation. Use estimated CPU utilization. + */ +SCHED_FEAT(UTIL_EST, true) diff --git a/ops/os_stat/os_stat/include_pub/kernel/sched/sched.h b/ops/os_stat/os_stat/include_pub/kernel/sched/sched.h new file mode 100644 index 0000000000000000000000000000000000000000..10b41af4a8617e17456ff4d7510cac4a04f833cd --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/kernel/sched/sched.h @@ -0,0 +1,2576 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal types and methods: + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_PARAVIRT +# include +#endif + +#include "cpupri.h" +#include "cpudeadline.h" + +#ifdef CONFIG_SCHED_DEBUG +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +#endif + +struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 + +extern __read_mostly int scheduler_running; + +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern void calc_global_load_tick(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to + * increase coverage and consistency always enable it on 64-bit platforms. + */ +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ +}) +#else +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. 
The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) + +/* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE 10 + +/* + * Single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + +static inline int rt_policy(int policy) +{ + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; +} +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || + rt_policy(policy) || dl_policy(policy); +} + +static inline int task_has_idle_policy(struct task_struct *p) +{ + return idle_policy(p->policy); +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. + * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; + unsigned int rt_period_active; +}; + +void __dl_clear_params(struct task_struct *p); + +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* + * To keep the bandwidth of -deadline tasks under control + * we need some place where: + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. 
+ * + * With respect to SMP, bandwidth is given on a per root domain basis, + * meaning that: + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; + */ +struct dl_bw { + raw_spinlock_t lock; + u64 bw; + u64 total_bw; +}; + +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + +static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +extern void dl_change_utilization(struct task_struct *p, u64 new_bw); +extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +extern bool dl_cpu_busy(unsigned int cpu); + +#ifdef CONFIG_CGROUP_SCHED + +#include +#include + +struct cfs_rq; +struct rt_rq; + +extern struct list_head task_groups; + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota; + u64 runtime; + u64 burst; + u64 buffer; + u64 max_overrun; + u64 runtime_at_period_start; + s64 hierarchical_quota; + + u8 idle; + u8 period_active; + u8 distribute_running; + u8 slack_started; + struct hrtimer period_timer; + struct hrtimer slack_timer; + struct list_head throttled_cfs_rq; + + /* Statistics: */ + int nr_periods; + int nr_throttled; + int nr_burst; + u64 throttled_time; + u64 burst_time; +#endif +}; + +/* Task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each CPU */ + struct sched_entity **se; + /* runqueue "owned" by this group on each CPU */ + struct cfs_rq **cfs_rq; + unsigned long shares; + +#ifdef CONFIG_SMP + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. 
+ */ + atomic_long_t load_avg ____cacheline_aligned; +#endif +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + u64 cpuquota_aware; + struct cfs_bandwidth cfs_bandwidth; + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* The two decimal precision [%] value requested from user-space */ + unsigned int uclamp_pct[UCLAMP_CNT]; + /* Clamp values requested for a task group */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a task group */ + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); +extern void unregister_fair_sched_group(struct task_group *tg); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); + +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, int init); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group *tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); + +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int 
sched_group_set_shares(struct task_group *tg, unsigned long shares); + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long runnable_weight; + unsigned int nr_running; + unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root_cached tasks_timeline; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr; + struct sched_entity *next; + struct sched_entity *last; + struct sched_entity *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP + /* + * CFS load tracking + */ + struct sched_avg avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + struct { + raw_spinlock_t lock ____cacheline_aligned; + int nr; + unsigned long load_avg; + unsigned long util_avg; + unsigned long runnable_sum; + } removed; + +#ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. + * This list is used during load balance. 
+ */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; + + u64 throttled_clock; + u64 throttled_clock_task; + u64 throttled_clock_task_time; + int throttled; + int throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* RT IPI pull logic requires IRQ_WORK */ +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) +# define HAVE_RT_PUSH_IPI +#endif + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; + +#endif /* CONFIG_SMP */ + int rt_queued; + + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct task_group *tg; +#endif +}; + +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root_cached root; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision whether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root_cached pushable_dl_tasks_root; +#else + struct dl_bw dl_bw; +#endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) +#else +#define entity_is_task(se) 1 +#endif + +#ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. 
+ */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + +struct perf_domain { + struct em_perf_domain *em_pd; + struct perf_domain *next; + struct rcu_head rcu; +}; + +/* Scheduling group status flags */ +#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member CPUs from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ + int overload; + + /* Indicate one or more cpus over-utilized (tipping point) */ + int overutilized; + + /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; + + unsigned long max_cpu_capacity; + + /* + * NULL-terminated list of performance domains intersecting with the + * CPUs of the rd. Protected by RCU. + */ + struct perf_domain __rcu *pd; + + KABI_RESERVE(1); + KABI_RESERVE(2); + KABI_RESERVE(3); + KABI_RESERVE(4); +}; + +extern void init_defrootdomain(void); +extern int sched_init_domains(const struct cpumask *cpu_map); +extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); + +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_UCLAMP_TASK +/* + * struct uclamp_bucket - Utilization clamp bucket + * @value: utilization clamp value for tasks on this clamp bucket + * @tasks: number of RUNNABLE tasks on this clamp bucket + * + * Keep track of how many tasks are RUNNABLE for a given utilization + * clamp value. + */ +struct uclamp_bucket { + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); +}; + +/* + * struct uclamp_rq - rq's utilization clamp + * @value: currently active clamp values for a rq + * @bucket: utilization clamp buckets affecting a rq + * + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. 
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE + * (or actually running) with that value. + * + * There are up to UCLAMP_CNT possible different clamp values, currently there + * are only two: minimum utilization and maximum utilization. + * + * All utilization clamping values are MAX aggregated, since: + * - for util_min: we want to run the CPU at least at the max of the minimum + * utilization required by its currently RUNNABLE tasks. + * - for util_max: we want to allow the CPU to run up to the max of the + * maximum utilization allowed by its currently RUNNABLE tasks. + * + * Since on each system we expect only a limited number of different + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track + * the metrics required to compute all the per-rq utilization clamp values. + */ +struct uclamp_rq { + unsigned int value; + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; +}; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); +#endif /* CONFIG_UCLAMP_TASK */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; +#endif +#ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +#endif /* CONFIG_SMP */ + unsigned int nohz_tick_stopped; + atomic_t nohz_flags; +#endif /* CONFIG_NO_HZ_COMMON */ + + unsigned long nr_load_updates; + u64 nr_switches; + +#ifdef CONFIG_UCLAMP_TASK + /* Utilization clamp values based on CPU's RUNNABLE tasks */ + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; + unsigned int uclamp_flags; +#define UCLAMP_FLAG_IDLE 0x01 +#endif + + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. 
Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; + + unsigned int clock_update_flags; + u64 clock; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; + u64 clock_pelt; + unsigned long lost_idle_time; + + atomic_t nr_iowait; + +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain __rcu *sd; + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; + + struct callback_head *balance_callback; + + unsigned char idle_balance; + + unsigned long misfit_task_load; + + /* For active balancing */ + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + + /* CPU of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; + + struct sched_avg avg_rt; + struct sched_avg avg_dl; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + struct sched_avg avg_irq; +#endif + u64 idle_stamp; + u64 avg_idle; + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + call_single_data_t hrtick_csd; +#endif + struct hrtimer hrtick_timer; + ktime_t hrtick_time; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} +#endif + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + + +#ifdef CONFIG_SCHED_SMT +extern void __update_idle_core(struct rq *rq); + +static inline void update_idle_core(struct rq *rq) +{ + if (static_branch_unlikely(&sched_smt_present)) + __update_idle_core(rq); +} + +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + +//DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +#define cpu_rq(cpu) (per_cpu_ptr(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +//#define raw_rq() raw_cpu_ptr(&runqueues) +#define raw_rq() raw_cpu_ptr(runqueues) + +extern void update_rq_clock(struct rq *rq); + +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return READ_ONCE(rq->clock); +} + +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq-clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + +static inline u64 rq_clock(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + + return rq->clock_task; +} + +static inline void rq_clock_skip_update(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttling, which is the only time a skip + * request is cancelled. 
+ */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). + */ + unsigned int clock_update_flags; +#endif +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(&rq->lock); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(&rq->lock, rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(&rq->lock, rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. + */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif +} + +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +extern void sched_init_numa(void); +extern void sched_domains_numa_masks_set(unsigned int cpu); +extern void 
sched_domains_numa_masks_clear(unsigned int cpu); +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); +#else +static inline void sched_init_numa(void) { } +static inline void sched_domains_numa_masks_set(unsigned int cpu) { } +static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + return nr_cpu_ids; +} +#endif + +#ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); +extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +#else +static inline void +init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_SMP + +static inline void +queue_balance_callback(struct rq *rq, + struct callback_head *head, + void (*func)(struct rq *rq)) +{ + lockdep_assert_held(&rq->lock); + + if (unlikely(head->next)) + return; + + head->func = (void (*)(struct callback_head *))func; + head->next = rq->balance_callback; + rq->balance_callback = head; +} + +extern void sched_ttwu_pending(void); + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See destroy_sched_domains: call_rcu for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) + +#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The CPU whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given CPU. + * + * Returns the highest sched_domain of a CPU which contains the given flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + for_each_domain(cpu, sd) { + if (!(sd->flags & flag)) + break; + hsd = sd; + } + + return hsd; +} + +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) + break; + } + + return sd; +} + +//DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_size); +DECLARE_PER_CPU(int, sd_llc_id); +//DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); +extern struct static_key_false sched_asym_cpucapacity; + +struct sched_group_capacity { + atomic_t ref; + /* + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity + * for a single CPU. 
+ */ + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ + +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* Balance mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* CPU of highest priority in group */ + + KABI_RESERVE(1); + KABI_RESERVE(2); + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_span(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * See build_balance_mask(). + */ +static inline struct cpumask *group_balance_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgc->cpumask); +} + +/** + * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. + * @group: The group whose first CPU is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_span(group)); +} + +extern int group_balance_cpu(struct sched_group *sg); + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + +#else + +static inline void sched_ttwu_pending(void) { } + +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "autogroup.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. 
+ */ +static inline struct task_group *task_group(struct task_struct *p) +{ + return p->sched_task_group; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfully executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + WRITE_ONCE(p->cpu, cpu); +#else + WRITE_ONCE(task_thread_info(p)->cpu, cpu); +#endif + p->wake_cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG + +/* + * To support run-time toggling of sched features, all the translation units + * (but core.c) reference the sysctl_sched_features defined in core.c. + */ +extern const_debug unsigned int sysctl_sched_features; + +#ifdef CONFIG_JUMP_LABEL +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ +{ \ + return static_key_##enabled(key); \ +} + +#include "features.h" +#undef SCHED_FEAT + +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) + +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ + +/* + * Each translation unit has its own copy of sysctl_sched_features to allow + * constants propagation at compile time and compiler optimization based on + * features default. 
+ */ +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | +static const_debug __maybe_unused unsigned int sysctl_sched_features = +#include "features.h" + 0; +#undef SCHED_FEAT + +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* SCHED_DEBUG */ + +extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; +} + +/* + * wake flags + */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; + +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 + +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x40 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif + +#define RETRY_TASK ((void *)-1UL) + +struct sched_class { + const struct sched_class *next; + +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + + /* + * Both @prev and @rf are optional and may be NULL, in which case the + * caller must already have invoked put_prev_task(rq, prev, rf). + * + * Otherwise it is the responsibility of the pick_next_task() to call + * put_prev_task() on the @prev task or something equivalent, IFF it + * returns a next task. + * + * In that case (@rf != NULL) it may return RETRY_TASK when it finds a + * higher prio class has runnable tasks. + */ + struct task_struct * (*pick_next_task)(struct rq *rq, + struct task_struct *prev, + struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + +#ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + void (*task_fork)(struct task_struct *p); + void (*task_dead)(struct task_struct *p); + + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. 
+ */ + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval)(struct rq *rq, + struct task_struct *task); + + void (*update_curr)(struct rq *rq); + +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*task_change_group)(struct task_struct *p, int type); +#endif + KABI_RESERVE(1); + KABI_RESERVE(2); + KABI_RESERVE(3); + KABI_RESERVE(4); +}; + +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + WARN_ON_ONCE(rq->curr != prev); + prev->sched_class->put_prev_task(rq, prev); +} + +static inline void set_next_task(struct rq *rq, struct task_struct *next) +{ + WARN_ON_ONCE(rq->curr != next); + next->sched_class->set_next_task(rq, next, false); +} + +#ifdef CONFIG_SMP +#define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = class->next) + +#define for_each_class(class) \ + for_class_range(class, sched_class_highest, NULL) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} + +#ifdef CONFIG_SMP + +extern void update_group_capacity(struct sched_domain *sd, int cpu); + +extern void trigger_load_balance(struct rq *rq); + +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + +#endif + +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + SCHED_WARN_ON(!rcu_read_lock_held()); + + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + +extern void schedule_idle(void); + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); + +extern void init_sched_dl_class(void); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); + +extern void reweight_task(struct task_struct *p, int prio); + +extern void resched_curr(struct rq *rq); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); + +#define BW_SHIFT 20 
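+/* + * For illustration: deadline bandwidths here are BW_SHIFT-bit fixed-point + * fractions of one CPU. Assuming the to_ratio(period, runtime) helper declared + * below computes (runtime << BW_SHIFT) / period, a task with runtime = 10ms + * every 100ms period is accounted as bw = (10 << BW_SHIFT) / 100 ~= 104857, + * i.e. roughly BW_UNIT / 10. + */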
+#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) +unsigned long to_ratio(u64 period, u64 runtime); + +extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p); + +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); +extern int __init sched_tick_offload_init(void); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. + */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline int sched_tick_offload_init(void) { return 0; } +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + unsigned prev_nr = rq->nr_running; + + rq->nr_running = prev_nr + count; + +#ifdef CONFIG_SMP + if (prev_nr < 2 && rq->nr_running >= 2) { + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); + } +#endif + + sched_update_tick_dependency(rq); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); +} + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + +#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPTION + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. 
+ */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower CPU-ids and will + * grant the double lock to lower CPUs over higher ids under contention, + * regardless of entry order into the function. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!raw_spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); + ret = 1; + } else + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); + } + return ret; +} + +#endif /* CONFIG_PREEMPTION */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work well under rq->lock */ + raw_spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + raw_spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + raw_spin_lock(l1); + raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +extern void set_rq_online (struct rq *rq); +extern void set_rq_offline(struct rq *rq); +extern bool sched_smp_initialized; + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + +#endif + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_SCHED_DEBUG +extern bool sched_debug_enabled; + +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); +extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); +#ifdef CONFIG_NUMA_BALANCING +extern void +show_numa_stats(struct task_struct *p, struct seq_file *m); +extern void +print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf); +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); + +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); + +#ifdef CONFIG_NO_HZ_COMMON +#define NOHZ_BALANCE_KICK_BIT 0 +#define NOHZ_STATS_KICK_BIT 1 + +#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) + +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) + +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(struct rq *rq); +#else +static inline void nohz_balance_exit_idle(struct rq *rq) { } +#endif + + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +struct irqtime { + u64 total; + u64 tick_delta; + u64 irq_start_time; + struct u64_stats_sync sync; +}; + +DECLARE_PER_CPU(struct irqtime, cpu_irqtime); + +/* + * Returns the 
irqtime minus the softirq time computed by ksoftirqd. + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * and never move forward. + */ +static inline u64 irq_time_read(int cpu) +{ + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); + unsigned int seq; + u64 total; + + do { + seq = __u64_stats_fetch_begin(&irqtime->sync); + total = irqtime->total; + } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); + + return total; +} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. + * + * This function is called by the scheduler on the CPU whose utilization is + * being updated. + * + * It can only be called from RCU-sched read-side critical sections. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). + * + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT tasks. + */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); + if (data) + data->func(data, rq_clock(rq), flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef CONFIG_UCLAMP_TASK +unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +/** + * uclamp_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ +static __always_inline +unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + unsigned int min_util; + unsigned int max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + if (p) { + min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); + max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); + } + + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. 
Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} + +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return uclamp_util_with(rq, util, NULL); +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + return util; +} +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return util; +} + +static inline bool uclamp_is_used(void) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK */ + +#ifdef arch_scale_freq_capacity +# ifndef arch_scale_freq_invariant +# define arch_scale_freq_invariant() true +# endif +#else +# define arch_scale_freq_invariant() false +#endif + +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif + +/** + * enum schedutil_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency + * @ENERGY_UTIL: Utilization used during energy calculation + * + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time + * need to be aggregated differently depending on the usage made of them. This + * enum is used within schedutil_freq_util() to differentiate the types of + * utilization expected by the callers, and adjust the aggregation accordingly. 
+ */ +enum schedutil_type { + FREQUENCY_UTIL, + ENERGY_UTIL, +}; + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p); + +static inline unsigned long cpu_bw_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + + return util; +} + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) +{ + return 0; +} +#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} +#endif + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + +#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ + return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ + +#define perf_domain_span(pd) NULL +static inline bool sched_energy_enabled(void) { return false; } + +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. 
+ */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif diff --git a/ops/os_stat/os_stat/include_pub/kernel/sched/stats.h b/ops/os_stat/os_stat/include_pub/kernel/sched/stats.h new file mode 100644 index 0000000000000000000000000000000000000000..4543af7b140c83857136ab57dd3453d9d38a5158 --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/kernel/sched/stats.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +#ifdef CONFIG_SCHEDSTATS + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcount++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_cpu_time += delta; +} + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var) do { var++; } while (0) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define __schedstat_add(var, amt) do { var += (amt); } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) + +#else /* !CONFIG_SCHEDSTATS: */ +static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } +# define schedstat_enabled() 0 +# define __schedstat_inc(var) do { } while (0) +# define schedstat_inc(var) do { } while (0) +# define __schedstat_add(var, amt) do { } while (0) +# define schedstat_add(var, amt) do { } while (0) +# define __schedstat_set(var, val) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +# define schedstat_val(var) 0 +# define schedstat_val_or_zero(var) 0 +#endif /* CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. 
+ */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (static_branch_likely(&psi_disabled)) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (static_branch_likely(&psi_disabled)) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (static_branch_likely(&psi_disabled)) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (static_branch_likely(&psi_disabled)) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + +#ifdef CONFIG_SCHED_INFO +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * We are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a CPU, we call this routine + * from dequeue_task() to account for possible rq->clock skew across CPUs. The + * delta taken on each CPU would annul the skew. + */ +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) +{ + unsigned long long now = rq_clock(rq), delta = 0; + + if (sched_info_on()) { + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + } + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(rq, delta); +} + +/* + * Called when a task finally hits the CPU. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct rq *rq, struct task_struct *t, struct task_struct *prev) +{ + unsigned long long now = rq_clock(rq), delta = 0; + + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcount++; + rq_sched_info_arrive(rq, delta); + sli_schedlat_rundelay(t, prev, delta); +} + +/* + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. 
+ */ +static inline void sched_info_queued(struct rq *rq, struct task_struct *t) +{ + if (sched_info_on()) { + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); + } +} + +/* + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. + */ +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) +{ + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; + + rq_sched_info_depart(rq, delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(rq, t); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + /* + * prev now departs the CPU. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. + */ + if (prev != rq->idle) + sched_info_depart(rq, prev); + + if (next != rq->idle) + sched_info_arrive(rq, next, prev); +} + +static inline void +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + if (sched_info_on()) + __sched_info_switch(rq, prev, next); +} + +#else /* !CONFIG_SCHED_INFO: */ +# define sched_info_queued(rq, t) do { } while (0) +# define sched_info_reset_dequeued(t) do { } while (0) +# define sched_info_dequeued(rq, t) do { } while (0) +# define sched_info_depart(rq, t) do { } while (0) +# define sched_info_arrive(rq, t, prev) do { } while (0) +# define sched_info_switch(rq, t, next) do { } while (0) +#endif /* CONFIG_SCHED_INFO */ diff --git a/ops/os_stat/os_stat/include_pub/mm/slab.h b/ops/os_stat/os_stat/include_pub/mm/slab.h new file mode 100644 index 0000000000000000000000000000000000000000..b2b01694dc43f63259fa3ee1e1b59ef3505f794b --- /dev/null +++ b/ops/os_stat/os_stat/include_pub/mm/slab.h @@ -0,0 +1,694 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. 
+ */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + slab_flags_t flags; /* Active flags on the slab */ + unsigned int useroffset;/* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#else /* !CONFIG_SLOB */ + +struct memcg_cache_array { + struct rcu_head rcu; + struct kmem_cache *entries[0]; +}; + +/* + * This is the main placeholder for memcg-related information in kmem caches. + * Both the root cache and the child caches will have it. For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. To allow the + * array to be accessed without taking any locks, on relocation we free the old + * version only after a grace period. + * + * Root and child caches hold different metadata. + * + * @root_cache: Common to root and child caches. NULL for root, pointer to + * the root cache for children. + * + * The following fields are specific to root caches. + * + * @memcg_caches: kmemcg ID indexed table of child caches. This table is + * used to index child cachces during allocation and cleared + * early during shutdown. + * + * @root_caches_node: List node for slab_root_caches list. + * + * @children: List of all child caches. While the child caches are also + * reachable through @memcg_caches, a child cache remains on + * this list until it is actually destroyed. + * + * The following fields are specific to child caches. + * + * @memcg: Pointer to the memcg this cache belongs to. + * + * @children_node: List node for @root_cache->children list. + * + * @kmem_caches_node: List node for @memcg->kmem_caches list. + */ +struct memcg_cache_params { + struct kmem_cache *root_cache; + union { + struct { + struct memcg_cache_array __rcu *memcg_caches; + struct list_head __root_caches_node; + struct list_head children; + bool dying; + }; + struct { + struct mem_cgroup *memcg; + struct list_head children_node; + struct list_head kmem_caches_node; + struct percpu_ref refcnt; + + void (*work_fn)(struct kmem_cache *); + union { + struct rcu_head rcu_head; + struct work_struct work; + }; + }; + }; +}; +#endif /* CONFIG_SLOB */ + +#ifdef CONFIG_SLAB +#include +#endif + +#ifdef CONFIG_SLUB +#include +#endif + +#include +#include +#include +#include +#include +#include + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. 
+ */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name; + unsigned int size; +} kmalloc_info[]; + +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); +void create_kmalloc_caches(slab_flags_t); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); +#endif + + +/* Functions provided by the slab allocators */ +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); + +struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize); +extern void create_boot_cache(struct kmem_cache *, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize); + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(unsigned size, unsigned align, + slab_flags_t flags, const char *name, void (*ctor)(void *)); +#ifndef CONFIG_SLOB +struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)); + +slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name, + void (*ctor)(void *)); +#else +static inline struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ return NULL; } + +static inline slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} +#endif + + +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_ACCOUNT) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +/* Common flags available with current configuration */ +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + 
SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_ACCOUNT) + +bool __kmem_cache_empty(struct kmem_cache *); +int __kmem_cache_shutdown(struct kmem_cache *); +void __kmem_cache_release(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); +void __kmemcg_cache_deactivate(struct kmem_cache *s); +void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); +void slab_kmem_cache_release(struct kmem_cache *); +void kmem_cache_shrink_all(struct kmem_cache *s); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the object listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + +static inline int cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE; +} + +#ifdef CONFIG_MEMCG_KMEM + +/* List of all root caches. */ +extern struct list_head slab_root_caches; +#define root_caches_node memcg_params.__root_caches_node + +/* + * Iterate over all memcg caches of the given root cache. The caller must hold + * slab_mutex. + */ +#define for_each_memcg_cache(iter, root) \ + list_for_each_entry(iter, &(root)->memcg_params.children, \ + memcg_params.children_node) + +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params.root_cache; +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return p == s || p == s->memcg_params.root_cache; +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + s = s->memcg_params.root_cache; + return s->name; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params.root_cache; +} + +/* + * Expects a pointer to a slab page. Please note, that PageSlab() check + * isn't sufficient, as it returns true also for tail compound slab pages, + * which do not have slab_cache pointer set. + * So this function assumes that the page can pass PageSlab() && !PageTail() + * check. + * + * The kmem_cache can be reparented asynchronously. The caller must ensure + * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex. + */ +static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) +{ + struct kmem_cache *s; + + s = READ_ONCE(page->slab_cache); + if (s && !is_root_cache(s)) + return READ_ONCE(s->memcg_params.memcg); + + return NULL; +} + +/* + * Charge the slab page belonging to the non-root kmem_cache. + * Can be called for non-root kmem_caches only. 
+ */ +static __always_inline int memcg_charge_slab(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + int ret; + + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + while (memcg && !css_tryget_online(&memcg->css)) + memcg = parent_mem_cgroup(memcg); + rcu_read_unlock(); + + if (unlikely(!memcg || mem_cgroup_is_root(memcg))) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + (1 << order)); + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + return 0; + } + + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (ret) + goto out; + + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); + + /* transer try_charge() page references to kmem_cache */ + percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order); + css_put_many(&memcg->css, 1 << order); +out: + css_put(&memcg->css); + return ret; +} + +/* + * Uncharge a slab page belonging to a non-root kmem_cache. + * Can be called for non-root kmem_caches only. + */ +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = READ_ONCE(s->memcg_params.memcg); + if (likely(!mem_cgroup_is_root(memcg))) { + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); + memcg_kmem_uncharge_memcg(page, order, memcg); + } else { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + } + rcu_read_unlock(); + + percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order); +} + +extern void slab_init_memcg_params(struct kmem_cache *); +extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg); + +#else /* CONFIG_MEMCG_KMEM */ + +/* If !memcg, all caches are root. 
*/ +#define slab_root_caches slab_caches +#define root_caches_node list + +#define for_each_memcg_cache(iter, root) \ + for ((void)(iter), (void)(root); 0; ) + +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return s == p; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} + +static inline struct mem_cgroup *memcg_from_slab_page(struct page *page) +{ + return NULL; +} + +static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, + struct kmem_cache *s) +{ + return 0; +} + +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + +static inline void slab_init_memcg_params(struct kmem_cache *s) +{ +} + +static inline void memcg_link_cache(struct kmem_cache *s, + struct mem_cgroup *memcg) +{ +} + +#endif /* CONFIG_MEMCG_KMEM */ + +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page; + + page = virt_to_head_page(obj); + if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + __func__)) + return NULL; + return page->slab_cache; +} + +static __always_inline int charge_slab_page(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) +{ + if (is_root_cache(s)) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + 1 << order); + return 0; + } + + return memcg_charge_slab(page, gfp, order, s); +} + +static __always_inline void uncharge_slab_page(struct page *page, int order, + struct kmem_cache *s) +{ + if (is_root_cache(s)) { + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), + -(1 << order)); + return; + } + + memcg_uncharge_slab(page, order, s); +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && + !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); + return cachep; +} + +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. 
+ */ + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s, flags)) + return NULL; + + if (memcg_kmem_enabled() && + ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT))) + return memcg_kmem_get_cache(s); + + return s; +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], flags); + /* As p[i] might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, flags); + } + + if (memcg_kmem_enabled()) + memcg_kmem_put_cache(s); +} + +#ifndef CONFIG_SLOB +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct alien_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. 
The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + +#endif + +void *slab_start(struct seq_file *m, loff_t *pos); +void *slab_next(struct seq_file *m, void *p, loff_t *pos); +void slab_stop(struct seq_file *m, void *p); +void *memcg_slab_start(struct seq_file *m, loff_t *pos); +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); +void memcg_slab_stop(struct seq_file *m, void *p); +int memcg_slab_show(struct seq_file *m, void *p); + +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +void dump_unreclaimable_slab(void); +#else +static inline void dump_unreclaimable_slab(void) +{ +} +#endif + +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp); +void cache_random_seq_destroy(struct kmem_cache *cachep); +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, + unsigned int count, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_alloc)) { + if (c->ctor) + return false; + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; + } + return flags & __GFP_ZERO; +} + +static inline bool slab_want_init_on_free(struct kmem_cache *c) +{ + if (static_branch_unlikely(&init_on_free)) + return !(c->ctor || + (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + return false; +} + +#endif /* MM_SLAB_H */ diff --git a/ops/os_stat/os_stat/include_tk2/arch/x86/include/asm/syscall.h b/ops/os_stat/os_stat/include_tk2/arch/x86/include/asm/syscall.h new file mode 100644 index 0000000000000000000000000000000000000000..2e188d68397c82930bdba80c52a87260546fcbd3 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/arch/x86/include/asm/syscall.h @@ -0,0 +1,244 @@ +/* + * Access to user system call parameters and results + * + * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * See asm-generic/syscall.h for descriptions of what we must do here. + */ + +#ifndef _ASM_X86_SYSCALL_H +#define _ASM_X86_SYSCALL_H + +#include +#include +#include +#include /* For NR_syscalls */ +#include /* for TS_COMPAT */ +#include + +extern const unsigned long sys_call_table[]; + +/* + * Only the low 32 bits of orig_ax are meaningful, so we return int. + * This importantly ignores the high bits on 64-bit, so comparisons + * sign-extend the low 32 bits. + */ +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) +{ + return regs->orig_ax; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + regs->ax = regs->orig_ax; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + unsigned long error = regs->ax; +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entries and then + * remains set until we return to user mode. 
+ */ + if (task_thread_info(task)->status & TS_COMPAT) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long) (int) error; +#endif + return IS_ERR_VALUE(error) ? error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->ax; +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->ax = (long) error ?: val; +} + +#ifdef CONFIG_X86_32 + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + unsigned long *args) +{ + BUG_ON(i + n > 6); + memcpy(args, ®s->bx + i, n * sizeof(args[0])); +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ + BUG_ON(i + n > 6); + memcpy(®s->bx + i, args, n * sizeof(args[0])); +} + +static inline int syscall_get_arch(struct task_struct *task, + struct pt_regs *regs) +{ + return AUDIT_ARCH_I386; +} + +#else /* CONFIG_X86_64 */ + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task_thread_info(task)->status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; + *args++ = regs->bx; + case 1: + if (!n--) break; + *args++ = regs->cx; + case 2: + if (!n--) break; + *args++ = regs->dx; + case 3: + if (!n--) break; + *args++ = regs->si; + case 4: + if (!n--) break; + *args++ = regs->di; + case 5: + if (!n--) break; + *args++ = regs->bp; + case 6: + if (!n--) break; + default: + BUG(); + break; + } + else +# endif + switch (i) { + case 0: + if (!n--) break; + *args++ = regs->di; + case 1: + if (!n--) break; + *args++ = regs->si; + case 2: + if (!n--) break; + *args++ = regs->dx; + case 3: + if (!n--) break; + *args++ = regs->r10; + case 4: + if (!n--) break; + *args++ = regs->r8; + case 5: + if (!n--) break; + *args++ = regs->r9; + case 6: + if (!n--) break; + default: + BUG(); + break; + } +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task_thread_info(task)->status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; + regs->bx = *args++; + case 1: + if (!n--) break; + regs->cx = *args++; + case 2: + if (!n--) break; + regs->dx = *args++; + case 3: + if (!n--) break; + regs->si = *args++; + case 4: + if (!n--) break; + regs->di = *args++; + case 5: + if (!n--) break; + regs->bp = *args++; + case 6: + if (!n--) break; + default: + BUG(); + break; + } + else +# endif + switch (i) { + case 0: + if (!n--) break; + regs->di = *args++; + case 1: + if (!n--) break; + regs->si = *args++; + case 2: + if (!n--) break; + regs->dx = *args++; + case 3: + if (!n--) break; + regs->r10 = *args++; + case 4: + if (!n--) break; + regs->r8 = *args++; + case 5: + if (!n--) break; + regs->r9 = *args++; + case 6: + if (!n--) break; + default: + BUG(); + break; + } +} + +static inline int syscall_get_arch(struct task_struct *task, + struct pt_regs *regs) +{ +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entry and then + * remains set until we return to user mode. + * + * TIF_IA32 tasks should always have TS_COMPAT set at + * system call time. 
+ * + * x32 tasks should be considered AUDIT_ARCH_X86_64. + */ + if (task_thread_info(task)->status & TS_COMPAT) + return AUDIT_ARCH_I386; +#endif + /* Both x32 and x86_64 are considered "64-bit". */ + return AUDIT_ARCH_X86_64; +} +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SYSCALL_H */ diff --git a/ops/os_stat/os_stat/include_tk2/drivers/target/target_core_file.h b/ops/os_stat/os_stat/include_tk2/drivers/target/target_core_file.h new file mode 100644 index 0000000000000000000000000000000000000000..d7772c167685fecc89caf699884198b9a9d9f999 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/drivers/target/target_core_file.h @@ -0,0 +1,45 @@ +#ifndef TARGET_CORE_FILE_H +#define TARGET_CORE_FILE_H + +#define FD_VERSION "4.0" + +#define FD_MAX_DEV_NAME 256 +#define FD_DEVICE_QUEUE_DEPTH 32 +#define FD_MAX_DEVICE_QUEUE_DEPTH 128 +#define FD_BLOCKSIZE 512 +/* + * Limited by the number of iovecs (2048) per vfs_[writev,readv] call + */ +#define FD_MAX_BYTES 8388608 + +#define RRF_EMULATE_CDB 0x01 +#define RRF_GOT_LBA 0x02 + +#define FBDF_HAS_PATH 0x01 +#define FBDF_HAS_SIZE 0x02 +#define FDBD_HAS_BUFFERED_IO_WCE 0x04 + +struct fd_dev { + struct se_device dev; + + u32 fbd_flags; + unsigned char fd_dev_name[FD_MAX_DEV_NAME]; + /* Unique Ramdisk Device ID in Ramdisk HBA */ + u32 fd_dev_id; + /* Number of SG tables in sg_table_array */ + u32 fd_table_count; + u32 fd_queue_depth; + u32 fd_block_size; + unsigned long long fd_dev_size; + struct file *fd_file; + /* FILEIO HBA device is connected to */ + struct fd_host *fd_host; +} ____cacheline_aligned; + +struct fd_host { + u32 fd_host_dev_id_count; + /* Unique FILEIO Host ID */ + u32 fd_host_id; +} ____cacheline_aligned; + +#endif /* TARGET_CORE_FILE_H */ diff --git a/ops/os_stat/os_stat/include_tk2/fs/ext4_new/ext4.h b/ops/os_stat/os_stat/include_tk2/fs/ext4_new/ext4.h new file mode 100644 index 0000000000000000000000000000000000000000..07d8642b03677b1f66b909e2e3a53fffb5341942 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/fs/ext4_new/ext4.h @@ -0,0 +1,2821 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) 
\ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_UNINIT (1 << BH_Uninit) +/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of + * ext4_map_blocks wants to know whether or not the underlying cluster has + * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that + * the requested mapping was from previously mapped (or delayed allocated) + * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster + * should never appear on buffer_head's state flags. 
+ */ +#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * For delayed allocation tracking + */ +struct mpage_da_data { + struct inode *inode; + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ + unsigned long first_page, next_page; /* extent of pages */ + struct writeback_control *wbc; + int io_done; + int pages_written; + int retval; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_ERROR 0x0002 +#define EXT4_IO_END_DIRECT 0x0004 + +/* + * For converting uninitialized extents on a work queue. + */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + struct inode *inode; /* file being written to */ + unsigned int flag; /* unwritten or not */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ + struct kiocb *iocb; /* iocb struct for AIO */ + int result; /* error value for AIO */ +} ext4_io_end_t; + +struct ext4_io_submit { + int io_op; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) 
(EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. 
*/ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ECOMPR = 11, /* Compression error */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unitialized + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unitialized extent */ +#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unitialized extents if not allocated, split the uninitialized + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size 
update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Do not take i_data_sem locking in ext4_map_blocks */ +#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 + /* Do not put hole in extent cache */ +#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RESERVE 0x0040 + +/* + * Flags used by ext4_discard_partial_page_buffers + */ +#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) 
LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. + */ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + u32 extra = sizeof(time->tv_sec) > 4 ? 
+ ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0; + return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (unlikely(sizeof(time->tv_sec) > 4 && + (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0) + /* Handle legacy encoding of pre-1970 dates with epoch + * bits 1,1. We assume that by kernel version 4.20, + * everyone will have run fsck over the affected + * filesystems to correct the problem. (This + * backwards compatibility may be removed before this + * time, at the discretion of the ext4 developers.) + */ + u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; + if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) + extra_bits = 0; + time->tv_sec += extra_bits << 32; +#else + time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; +#endif + } + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. 
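The encode/decode pair above packs a 64-bit tv_sec into the 32-bit on-disk seconds field plus two "epoch" bits in the extra word, with tv_nsec in the remaining 30 bits. A minimal userspace sketch of that packing, assuming the layout described in the table above (constant and function names here are illustrative, not part of the header):

```c
/* Userspace sketch of the extra-timestamp packing described above:
 * bits 1:0 of "extra" carry the high bits of a 64-bit tv_sec, bits 31:2
 * carry tv_nsec.  EPOCH_* mirror EXT4_EPOCH_BITS/MASK; everything else
 * is illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define EPOCH_BITS 2
#define EPOCH_MASK ((1u << EPOCH_BITS) - 1)

static uint32_t encode_extra(int64_t sec, uint32_t nsec)
{
        /* same idea as ext4_encode_extra_time(): the epoch bits are the part
         * of tv_sec that the sign-extended low 32 bits cannot represent */
        uint32_t epoch = (uint32_t)((sec - (int64_t)(int32_t)(uint32_t)sec) >> 32) & EPOCH_MASK;
        return epoch | (nsec << EPOCH_BITS);
}

static void decode_extra(uint32_t base, uint32_t extra, int64_t *sec, uint32_t *nsec)
{
        *sec  = (int64_t)(int32_t)base + ((int64_t)(extra & EPOCH_MASK) << 32);
        *nsec = extra >> EPOCH_BITS;
}

int main(void)
{
        int64_t sec = 0x123456789LL;            /* a post-2038 second count */
        uint32_t extra = encode_extra(sec, 500);
        int64_t s;
        uint32_t ns;

        decode_extra((uint32_t)sec, extra, &s, &ns);
        printf("%lld.%09u\n", (long long)s, ns);   /* 4886718345.000000500 */
        return 0;
}
```

The decode side of this sketch matches the simple (post-4.20) branch of ext4_decode_extra_time(); the legacy 1,1-epoch fixup is omitted for brevity.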
+ * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) + * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. 
+ */ + struct rw_semaphore i_mmap_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, these refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* completed IOs that might need unwritten extents handling */ + struct list_head i_completed_io_list; + spinlock_t i_completed_io_lock; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + struct work_struct i_unwritten_work; /* deferred extent conversion */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
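Most of the EXT4_MOUNT_* values above are independent boolean bits, but the data-journalling mode is a two-bit field selected under the EXT4_MOUNT_DATA_FLAGS mask, which is why the set_opt()/test_opt() helpers that follow operate on whole masks rather than single bits. A small, self-contained sketch of manipulating such a flag word (local names standing in for the kernel macros and for s_mount_opt in ext4_sb_info):

```c
/* Illustrative flag-word handling: a boolean option plus the 2-bit
 * data-journalling mode selected by masking with MOUNT_DATA_FLAGS. */
#include <stdio.h>

#define MOUNT_DATA_FLAGS     0x00C00
#define MOUNT_ORDERED_DATA   0x00800
#define MOUNT_BARRIER        0x20000

int main(void)
{
        unsigned int s_mount_opt = 0;

        /* set_opt()-style updates */
        s_mount_opt |= MOUNT_BARRIER;
        s_mount_opt = (s_mount_opt & ~MOUNT_DATA_FLAGS) | MOUNT_ORDERED_DATA;

        /* test_opt()-style queries */
        printf("barrier=%d ordered=%d\n",
               !!(s_mount_opt & MOUNT_BARRIER),
               (s_mount_opt & MOUNT_DATA_FLAGS) == MOUNT_ORDERED_DATA);
        return 0;
}
```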
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... 
+ */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __le16 s_reserved_pad; + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last 
error */ + __u8 s_last_error_func[32]; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_reserved[108]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_resize_flags; /* Flags indicating if there + is a resizer */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + 
unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? 
+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_USR_QUOTA_INO || + ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_BOOT_LOADER_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) +{ + return inode->i_private; +} + +static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) +{ + inode->i_private = io; +} + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +EXT4_INODE_BIT_FNS(flag, flags, 0) +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
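EXT4_INODE_BIT_FNS() above generates test/set/clear helpers at a fixed bit offset; on 64-bit builds the dynamic state bits share the i_flags word with the persistent flag bits, offset by 32, which is why ext4_clear_state_flags() can be a no-op there. A rough userspace illustration of that layout (assumes a 64-bit unsigned long and uses plain bit operations instead of the kernel's atomic set_bit()/test_bit(); names are made up):

```c
/* Illustrative layout: persistent flags in bits 0..31, dynamic state
 * bits in bits 32..63 of the same unsigned long. */
#include <stdio.h>

enum { STATE_NEW = 1, STATE_XATTR = 2 };            /* example state bits */

static void set_state(unsigned long *flags, int bit)
{
        *flags |= (1UL << (bit + 32));
}

static int test_state(const unsigned long *flags, int bit)
{
        return !!(*flags & (1UL << (bit + 32)));
}

int main(void)
{
        unsigned long i_flags = 0x0008;              /* some persistent flag */

        set_state(&i_flags, STATE_XATTR);
        printf("flags=%#lx state_xattr=%d\n",
               i_flags, test_state(&i_flags, STATE_XATTR));
        return 0;
}
```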
*/ +#define EXT4_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
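The EXT4_HAS_*_FEATURE() macros above compare an on-disk (little-endian) feature word against a CPU-order mask converted with cpu_to_le32(). A standalone sketch of the same test from userspace, with a hand-rolled little-endian helper and a sample feature word (both purely illustrative):

```c
/* Testing feature bits against a little-endian on-disk word, as the
 * EXT4_HAS_*_FEATURE() macros do. */
#include <stdint.h>
#include <stdio.h>

#define INCOMPAT_EXTENTS 0x0040
#define INCOMPAT_64BIT   0x0080

static uint32_t le32_to_host(const uint8_t b[4])
{
        return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
               ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

int main(void)
{
        /* s_feature_incompat as it might appear on disk: 0x000002C2 */
        uint8_t raw[4] = { 0xC2, 0x02, 0x00, 0x00 };
        uint32_t incompat = le32_to_host(raw);

        printf("extents=%d 64bit=%d\n",
               !!(incompat & INCOMPAT_EXTENTS),
               !!(incompat & INCOMPAT_64BIT));       /* extents=1 64bit=1 */
        return 0;
}
```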
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* 
Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... 
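EXT4_DIR_REC_LEN() above gives the minimum on-disk size of a directory entry: an 8-byte fixed header plus the name, rounded up to the 4-byte EXT4_DIR_PAD boundary. A quick standalone check of that arithmetic (local macro copies, for illustration only):

```c
/* Minimum dirent sizes: 8-byte header + name, rounded up to 4 bytes. */
#include <stdio.h>
#include <string.h>

#define DIR_PAD   4
#define DIR_ROUND (DIR_PAD - 1)
#define DIR_REC_LEN(name_len) (((name_len) + 8 + DIR_ROUND) & ~DIR_ROUND)

int main(void)
{
        const char *names[] = { "a", "hello", "a_longer_file_name.txt" };

        for (int i = 0; i < 3; i++)
                printf("%-24s -> rec_len %u\n", names[i],
                       (unsigned)DIR_REC_LEN(strlen(names[i])));
        /* prints 12, 16 and 32 respectively */
        return 0;
}
```

Note that this is only the minimum: the rec_len actually stored in a directory block can be larger, since the last entry in a block absorbs any trailing free space.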
+ */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_CACHE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +/* + * Timeout and state flag for lazy initialization inode thread. 
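The ext4_rec_len_to_disk()/ext4_rec_len_from_disk() pair above deals with rec_len being only 16 bits on disk: with a 64 KiB block size, a record covering the whole block is stored as the 0xFFFF sentinel (or 0), and still larger block sizes fold the high length bits into the low two bits. A simplified userspace round-trip of that scheme (the BUG() checks and PAGE_CACHE_SIZE conditionals are omitted; helper names are local):

```c
/* Round-trip of the 16-bit rec_len encoding for large block sizes. */
#include <stdint.h>
#include <stdio.h>

#define MAX_REC_LEN ((1 << 16) - 1)

static uint16_t rec_len_to_disk(unsigned len, unsigned blocksize)
{
        if (len < 65536)
                return (uint16_t)len;
        if (len == blocksize)
                return blocksize == 65536 ? MAX_REC_LEN : 0;
        return (uint16_t)((len & 65532) | ((len >> 16) & 3));
}

static unsigned rec_len_from_disk(uint16_t dlen, unsigned blocksize)
{
        unsigned len = dlen;

        if (len == MAX_REC_LEN || len == 0)
                return blocksize;
        return (len & 65532) | ((len & 3) << 16);
}

int main(void)
{
        unsigned blocksize = 65536;
        uint16_t on_disk = rec_len_to_disk(blocksize, blocksize);

        printf("stored %#x, decoded %u\n",
               on_disk, rec_len_from_disk(on_disk, blocksize)); /* 0xffff, 65536 */
        return 0;
}
```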
+ */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. 
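Per the comment above, a checker that finds mmp_seq set to EXT4_MMP_SEQ_FSCK, or to any unknown value above EXT4_MMP_SEQ_MAX, must treat the filesystem as unsafe to use regardless of the timestamp. A tiny illustrative classifier of mmp_seq values (only the constants come from the header; the function and sample values are made up):

```c
/* Classify an mmp_seq value according to the rules described above. */
#include <stdint.h>
#include <stdio.h>

#define MMP_SEQ_CLEAN 0xFF4D4D50U
#define MMP_SEQ_FSCK  0xE24D4D50U
#define MMP_SEQ_MAX   0xE24D4D4FU

static const char *mmp_state(uint32_t seq)
{
        if (seq == MMP_SEQ_CLEAN)
                return "cleanly unmounted";
        if (seq == MMP_SEQ_FSCK)
                return "being checked by fsck - do not use";
        if (seq > MMP_SEQ_MAX)
                return "unknown special state - do not use";
        return "in normal use";
}

int main(void)
{
        uint32_t samples[] = { 1, MMP_SEQ_CLEAN, MMP_SEQ_FSCK, 0xF0000000U };

        for (int i = 0; i < 4; i++)
                printf("%#010x: %s\n", (unsigned)samples[i],
                       mmp_state(samples[i]));
        return 0;
}
```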
+ */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh); +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +extern unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct 
buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); +extern int ext4_flush_unwritten_io(struct inode *); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, int handle_type, + unsigned int line_no, int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + +/* inode.c */ +struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head 
*bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_can_truncate(struct inode *inode); +extern void ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_dirent_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + 
const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern int ext4_calculate_overhead(struct super_block *sb); +extern int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern void ext4_kvfree(void *ptr); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ + __LINE__, ## message) +extern __printf(5, 6) +void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ + __LINE__, ## message) +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ + __LINE__, ## message) +extern __printf(3, 4) +void ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ + __LINE__, msg) +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); +#define ext4_grp_locked_error(sb, grp, message...) 
\ + __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) +extern void ext4_update_dynamic_rev(struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static 
inline loff_t ext4_isize(struct ext4_inode *raw_inode) +{ + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + else + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); + return ; +} + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. 
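As a rough illustration of the bookkeeping just described, a minimal standalone sketch (this is not an ext4 helper; the function name, the sample counter values and the max-order bound are assumptions chosen only to show how bb_counters[order] regions of (1 << order) blocks add up):

#include <stdio.h>

/* Illustrative sketch only, not ext4 code: sums per-order buddy counters
 * into a total free-block count. */
static unsigned long free_blocks_from_counters(const unsigned int *counters,
                                               int max_order)
{
        unsigned long total = 0;
        int order;

        for (order = 0; order <= max_order; order++)
                total += (unsigned long)counters[order] << order;
        return total;
}

int main(void)
{
        /* e.g. 2 free single blocks plus 5 free 8-block (order-3) regions */
        unsigned int counters[] = { 2, 0, 0, 5 };

        printf("%lu free blocks\n", free_blocks_from_counters(counters, 3));
        /* prints "42 free blocks": 2*1 + 5*8 */
        return 0;
}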
*/ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); +extern void ext4_unwritten_wait(struct inode *inode); + +/* inline.c */ +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_inline_size(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern void ext4_write_inline_data(struct inode *inode, + struct ext4_iloc *iloc, + void *buffer, loff_t pos, + unsigned int len); +extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct 
page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned 
int); + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path *, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2); +void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_add_complete_io(ext4_io_end_t *io_end); +extern void ext4_exit_pageio(void); +extern void ext4_ioend_shutdown(struct inode *); +extern void ext4_free_io_end(ext4_io_end_t *io); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern void ext4_end_io_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); +extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); +extern int ext4_mmp_csum_verify(struct super_block *sb, + struct mmp_struct *mmp); + +/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +enum ext4_state_bits { + BH_Uninit /* blocks are allocated but uninitialized on disk */ + = BH_JBDPrivateStart, + BH_AllocFromCluster, /* allocated blocks were part of already + * 
allocated cluster. Note that this flag will + * never, ever appear in a buffer_head's state + * flag. See EXT4_MAP_FROM_CLUSTER to see where + * this is used. */ +}; + +BUFFER_FNS(Uninit, uninit) +TAS_BUFFER_FNS(Uninit, uninit) + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +#define EXT4_RESIZING 0 +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + +#endif /* __KERNEL__ */ + +#endif /* _EXT4_H */ diff --git a/ops/os_stat/os_stat/include_tk2/fs/ext4_new/extents_status.h b/ops/os_stat/os_stat/include_tk2/fs/ext4_new/extents_status.h new file mode 100644 index 0000000000000000000000000000000000000000..ac151c1eb780b1bbe495f6b3abce917a621af446 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/fs/ext4_new/extents_status.h @@ -0,0 +1,140 @@ +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. 
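The es_debug()/no_printk() switch above is a common kernel idiom: the disabled variant still gets printf-style format checking at compile time but emits nothing. A minimal userspace analogue, assuming the made-up names DBG_ENABLED and dbg_print, might look like this:

#include <stdio.h>

/* Illustrative sketch only: when DBG_ENABLED is 0 the call compiles away,
 * yet the format string and arguments are still type-checked. */
#define DBG_ENABLED 0

#define dbg_print(fmt, ...)                          \
        do {                                         \
                if (DBG_ENABLED)                     \
                        printf(fmt, ##__VA_ARGS__);  \
        } while (0)

int main(void)
{
        dbg_print("cached extent: lblk=%u len=%u\n", 100u, 8u);
        return 0;
}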
+ */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +#define EXTENT_STATUS_WRITTEN (1ULL << 63) +#define EXTENT_STATUS_UNWRITTEN (1ULL << 62) +#define EXTENT_STATUS_DELAYED (1ULL << 61) +#define EXTENT_STATUS_HOLE (1ULL << 60) + +#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned long long status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_HOLE) != 0; +} + +static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_FLAGS); +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return (es->es_pblk & ~EXTENT_STATUS_FLAGS); +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~EXTENT_STATUS_FLAGS) | + (es->es_pblk & EXTENT_STATUS_FLAGS); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned long long status) +{ + ext4_fsblk_t block; + + block = (status & EXTENT_STATUS_FLAGS) | + (es->es_pblk & ~EXTENT_STATUS_FLAGS); + es->es_pblk = block; +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_list_add(struct inode *inode); +extern void ext4_es_list_del(struct inode *inode); + +extern unsigned int ext4_shrink_es_timeout; +extern unsigned int ext4_shrink_es_timeout_min; +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/ops/os_stat/os_stat/include_tk2/fs/xfs/xfs_log_priv.h b/ops/os_stat/os_stat/include_tk2/fs/xfs/xfs_log_priv.h new file mode 100644 index 
0000000000000000000000000000000000000000..51bf7b827387198d3bbc9f6db146abfcd3056a0c --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/fs/xfs/xfs_log_priv.h @@ -0,0 +1,619 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_PRIV_H__ +#define __XFS_LOG_PRIV_H__ + +struct xfs_buf; +struct xlog; +struct xlog_ticket; +struct xfs_mount; +struct xfs_log_callback; + +/* + * Flags for log structure + */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. + * + * this has endian issues, of course. + */ +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ + +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. 
So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). + * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. + * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + +/* Ticket reservation region accounting */ +#define XLOG_TIC_LEN_MAX 15 + +/* + * Reservation region + * As would be stored in xfs_log_iovec but without the i_addr which + * we don't care about. 
+ */ +typedef struct xlog_res { + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ +} xlog_res_t; + +typedef struct xlog_ticket { + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + char t_ocnt; /* original count : 1 */ + char t_cnt; /* current count : 1 */ + char t_clientid; /* who does this belong to; : 1 */ + char t_flags; /* properties of reservation : 1 */ + + /* reservation array fields */ + uint t_res_num; /* num in array : 4 */ + uint t_res_num_ophdrs; /* num op hdrs : 4 */ + uint t_res_arr_sum; /* array sum : 4 */ + uint t_res_o_flow; /* sum overflow : 4 */ + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ +} xlog_ticket_t; + +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. + * - ic_next is the pointer to the next iclog in the ring. + * - ic_bp is a pointer to the buffer used to write this incore log to disk. + * - ic_log is a pointer back to the global log structure. + * - ic_callback is a linked list of callback function/argument pairs to be + * called after an iclog finishes writing. + * - ic_size is the full size of the header plus data. + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. + */ +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xfs_buf *ic_bp; + struct xlog *ic_log; + int ic_size; + int ic_offset; + int ic_bwritecnt; + unsigned short ic_state; + char *ic_datap; /* pointer to iclog data */ + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + struct xfs_log_callback *ic_callback; + struct xfs_log_callback **ic_callback_tail; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header +} xlog_in_core_t; + +/* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. 
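The cacheline separation described for xlog_in_core can be sketched in a few lines. The struct below is invented purely for illustration and uses C11 _Alignas with an assumed 64-byte cacheline where the kernel would write ____cacheline_aligned_in_smp:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define CACHELINE_BYTES 64   /* assumption: typical 64-byte cacheline */

/* Illustrative sketch only: fields that different CPUs write concurrently
 * are given their own cachelines so they do not false-share with the
 * read-mostly fields at the top. All names here are invented. */
struct example_iclog {
        /* read-mostly identity: first cacheline */
        int   size;
        char *data;

        /* hot reference count gets a cacheline of its own */
        _Alignas(CACHELINE_BYTES) atomic_int refcnt;

        /* callback state is contended by a different set of CPUs again */
        _Alignas(CACHELINE_BYTES) void (*callback)(void *arg);
        void *callback_arg;
};

int main(void)
{
        printf("refcnt at offset %zu, callback at offset %zu\n",
               offsetof(struct example_iclog, refcnt),
               offsetof(struct example_iclog, callback));
        return 0;
}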
+ */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct xfs_log_callback log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ + struct work_struct discard_endio_work; +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct xlog *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; + struct xfs_cil_ctx *xc_ctx; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; +} ____cacheline_aligned_in_smp; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. 
+ * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) + +/* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. + */ +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number. Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. 
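The packing this comment describes, a cycle number in the upper 32 bits and a byte offset in the lower 32 bits of one 64-bit value, is what the xlog_assign_grant_head_val() and xlog_crack_grant_head_val() helpers further down implement; keeping both halves in one word is what lets each grant head live in a single atomic64_t. A standalone sketch with arbitrary demo values:

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only: mirrors the cycle/byte-offset packing used by
 * the grant heads. The demo values are arbitrary. */
static int64_t pack_grant_head(int cycle, int bytes)
{
        return ((int64_t)cycle << 32) | (uint32_t)bytes;
}

static void crack_grant_head(int64_t val, int *cycle, int *bytes)
{
        *cycle = (int)(val >> 32);
        *bytes = (int)(val & 0xffffffff);
}

int main(void)
{
        int cycle, bytes;

        crack_grant_head(pack_grant_head(7, 123456), &cycle, &bytes);
        printf("cycle=%d bytes=%d\n", cycle, bytes);  /* cycle=7 bytes=123456 */
        return 0;
}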
+ */ +struct xlog { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ + struct xfs_buf *l_xbuf; /* extra buffer for log + * wrapping */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; + + struct xfs_kobj l_kobj; + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + void *l_iclog_bak[XLOG_MAX_ICLOGS]; + /* log record crc error injection factor */ + uint32_t l_badcrc_factor; +#endif + /* log recovery lsn tracking (for buffer submission */ + xfs_lsn_t l_recovery_lsn; +}; + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) + +/* common routines */ +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); +extern int +xlog_recover_cancel(struct xlog *); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +void xlog_print_trans(struct xfs_trans *); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that 
the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); + +/* + * CIL force routines + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct xlog *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} + +/* + * The LSN is valid so long as it is behind the current LSN. If it isn't, this + * means that the next log record that includes this metadata could have a + * smaller LSN. In turn, this means that the modification in the log would not + * replay. + */ +static inline bool +xlog_valid_lsn( + struct xlog *log, + xfs_lsn_t lsn) +{ + int cur_cycle; + int cur_block; + bool valid = true; + + /* + * First, sample the current lsn without locking to avoid added + * contention from metadata I/O. The current cycle and block are updated + * (in xlog_state_switch_iclogs()) and read here in a particular order + * to avoid false negatives (e.g., thinking the metadata LSN is valid + * when it is not). + * + * The current block is always rewound before the cycle is bumped in + * xlog_state_switch_iclogs() to ensure the current LSN is never seen in + * a transiently forward state. 
Instead, we can see the LSN in a + * transiently behind state if we happen to race with a cycle wrap. + */ + cur_cycle = ACCESS_ONCE(log->l_curr_cycle); + smp_rmb(); + cur_block = ACCESS_ONCE(log->l_curr_block); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { + /* + * If the metadata LSN appears invalid, it's possible the check + * above raced with a wrap to the next log cycle. Grab the lock + * to check for sure. + */ + spin_lock(&log->l_icloglock); + cur_cycle = log->l_curr_cycle; + cur_block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) + valid = false; + } + + return valid; +} + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/ops/os_stat/os_stat/include_tk2/fs/xfs/xfs_trans_priv.h b/ops/os_stat/os_stat/include_tk2/fs/xfs/xfs_trans_priv.h new file mode 100644 index 0000000000000000000000000000000000000000..b317a3644c006817d7c6a2ccc3b0e08cc59e2a3f --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/fs/xfs/xfs_trans_priv.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_PRIV_H__ +#define __XFS_TRANS_PRIV_H__ + +struct xfs_log_item; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; + + +void xfs_trans_init(struct xfs_mount *); +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + bool abort); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); + +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); +/* + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. + */ +struct xfs_ail_cursor { + struct list_head list; + struct xfs_log_item *item; +}; + +/* + * Private AIL structures. 
+ * + * Eventually we need to drive the locking in here as well. + */ +struct xfs_ail { + struct xfs_mount *xa_mount; + struct task_struct *xa_task; + struct list_head xa_ail; + xfs_lsn_t xa_target; + xfs_lsn_t xa_target_prev; + struct list_head xa_cursors; + spinlock_t xa_lock; + xfs_lsn_t xa_last_pushed_lsn; + int xa_log_flush; + struct list_head xa_buf_list; + wait_queue_head_t xa_empty; +}; + +/* + * From xfs_trans_ail.c + */ +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static inline struct xfs_log_item * +xfs_ail_min( + struct xfs_ail *ailp) +{ + return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item, + li_ail); +} + +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock); + +static inline void +xfs_trans_ail_remove( + struct xfs_log_item *lip, + int shutdown_type) +{ + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock */ + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip, shutdown_type); + else + spin_unlock(&ailp->xa_lock); +} + +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +void xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); + +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); + +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->xa_lock); + *dst = *src; + spin_unlock(&ailp->xa_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (lip->li_flags & XFS_LI_FAILED) { + lip->li_flags &= ~XFS_LI_FAILED; + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (!(lip->li_flags & XFS_LI_FAILED)) { + xfs_buf_hold(bp); + lip->li_flags |= XFS_LI_FAILED; + lip->li_buf = bp; + } +} + +#endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/ops/os_stat/os_stat/include_tk2/include/generated/asm-offsets.h b/ops/os_stat/os_stat/include_tk2/include/generated/asm-offsets.h new file mode 100644 index 
0000000000000000000000000000000000000000..42dd77f36ce8b43ca2cf478af6dc2ddb1f5348f2 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/include/generated/asm-offsets.h @@ -0,0 +1,98 @@ +#ifndef __ASM_OFFSETS_H__ +#define __ASM_OFFSETS_H__ +/* + * DO NOT MODIFY. + * + * This file was generated by Kbuild + */ + +#define PV_CPU_usergs_sysret64 232 /* offsetof(struct pv_cpu_ops, usergs_sysret64) */ +#define PV_CPU_swapgs 248 /* offsetof(struct pv_cpu_ops, swapgs) */ + +#define KVM_STEAL_TIME_preempted 16 /* offsetof(struct kvm_steal_time, preempted) */ + +#define pt_regs_bx 40 /* offsetof(struct pt_regs, bx) */ +#define pt_regs_cx 88 /* offsetof(struct pt_regs, cx) */ +#define pt_regs_dx 96 /* offsetof(struct pt_regs, dx) */ +#define pt_regs_sp 152 /* offsetof(struct pt_regs, sp) */ +#define pt_regs_bp 32 /* offsetof(struct pt_regs, bp) */ +#define pt_regs_si 104 /* offsetof(struct pt_regs, si) */ +#define pt_regs_di 112 /* offsetof(struct pt_regs, di) */ +#define pt_regs_r8 72 /* offsetof(struct pt_regs, r8) */ +#define pt_regs_r9 64 /* offsetof(struct pt_regs, r9) */ +#define pt_regs_r10 56 /* offsetof(struct pt_regs, r10) */ +#define pt_regs_r11 48 /* offsetof(struct pt_regs, r11) */ +#define pt_regs_r12 24 /* offsetof(struct pt_regs, r12) */ +#define pt_regs_r13 16 /* offsetof(struct pt_regs, r13) */ +#define pt_regs_r14 8 /* offsetof(struct pt_regs, r14) */ +#define pt_regs_r15 0 /* offsetof(struct pt_regs, r15) */ +#define pt_regs_flags 144 /* offsetof(struct pt_regs, flags) */ + +#define saved_context_cr0 202 /* offsetof(struct saved_context, cr0) */ +#define saved_context_cr2 210 /* offsetof(struct saved_context, cr2) */ +#define saved_context_cr3 218 /* offsetof(struct saved_context, cr3) */ +#define saved_context_cr4 226 /* offsetof(struct saved_context, cr4) */ +#define saved_context_cr8 234 /* offsetof(struct saved_context, cr8) */ +#define saved_context_gdt_desc 277 /* offsetof(struct saved_context, gdt_desc) */ + +#define TSS_ist 36 /* offsetof(struct tss_struct, x86_tss.ist) */ +#define TSS_sp0 4 /* offsetof(struct tss_struct, x86_tss.sp0) */ +#define TSS_sp1 12 /* offsetof(struct tss_struct, x86_tss.sp1) */ + +#define stack_canary_offset 40 /* offsetof(union irq_stack_union, stack_canary) */ + +#define __NR_syscall_max 332 /* sizeof(syscalls_64) - 1 */ +#define NR_syscalls 333 /* sizeof(syscalls_64) */ +#define __NR_syscall_compat_max 384 /* sizeof(syscalls_ia32) - 1 */ +#define IA32_NR_syscalls 385 /* sizeof(syscalls_ia32) */ + +#define TASK_threadsp 9176 /* offsetof(struct task_struct, thread.sp) */ +#define TASK_stack_canary 2808 /* offsetof(struct task_struct, stack_canary) */ + +#define TASK_TI_flags 0 /* offsetof(struct task_struct, thread_info.flags) */ +#define TASK_addr_limit 9304 /* offsetof(struct task_struct, thread.addr_limit) */ + +#define crypto_tfm_ctx_offset 64 /* offsetof(struct crypto_tfm, __crt_ctx) */ + +#define pbe_address 0 /* offsetof(struct pbe, address) */ +#define pbe_orig_address 8 /* offsetof(struct pbe, orig_address) */ +#define pbe_next 16 /* offsetof(struct pbe, next) */ + +#define IA32_SIGCONTEXT_ax 44 /* offsetof(struct sigcontext_32, ax) */ +#define IA32_SIGCONTEXT_bx 32 /* offsetof(struct sigcontext_32, bx) */ +#define IA32_SIGCONTEXT_cx 40 /* offsetof(struct sigcontext_32, cx) */ +#define IA32_SIGCONTEXT_dx 36 /* offsetof(struct sigcontext_32, dx) */ +#define IA32_SIGCONTEXT_si 20 /* offsetof(struct sigcontext_32, si) */ +#define IA32_SIGCONTEXT_di 16 /* offsetof(struct sigcontext_32, di) */ +#define IA32_SIGCONTEXT_bp 24 /* offsetof(struct 
sigcontext_32, bp) */ +#define IA32_SIGCONTEXT_sp 28 /* offsetof(struct sigcontext_32, sp) */ +#define IA32_SIGCONTEXT_ip 56 /* offsetof(struct sigcontext_32, ip) */ + +#define IA32_RT_SIGFRAME_sigcontext 164 /* offsetof(struct rt_sigframe_ia32, uc.uc_mcontext) */ + +#define PARAVIRT_PATCH_pv_cpu_ops 24 /* offsetof(struct paravirt_patch_template, pv_cpu_ops) */ +#define PARAVIRT_PATCH_pv_irq_ops 296 /* offsetof(struct paravirt_patch_template, pv_irq_ops) */ +#define PV_IRQ_irq_disable 16 /* offsetof(struct pv_irq_ops, irq_disable) */ +#define PV_IRQ_irq_enable 24 /* offsetof(struct pv_irq_ops, irq_enable) */ +#define PV_CPU_iret 240 /* offsetof(struct pv_cpu_ops, iret) */ +#define PV_CPU_read_cr0 16 /* offsetof(struct pv_cpu_ops, read_cr0) */ +#define PV_MMU_read_cr2 0 /* offsetof(struct pv_mmu_ops, read_cr2) */ + +#define BP_scratch 484 /* offsetof(struct boot_params, scratch) */ +#define BP_secure_boot 492 /* offsetof(struct boot_params, secure_boot) */ +#define BP_loadflags 529 /* offsetof(struct boot_params, hdr.loadflags) */ +#define BP_hardware_subarch 572 /* offsetof(struct boot_params, hdr.hardware_subarch) */ +#define BP_version 518 /* offsetof(struct boot_params, hdr.version) */ +#define BP_kernel_alignment 560 /* offsetof(struct boot_params, hdr.kernel_alignment) */ +#define BP_init_size 608 /* offsetof(struct boot_params, hdr.init_size) */ +#define BP_pref_address 600 /* offsetof(struct boot_params, hdr.pref_address) */ +#define BP_code32_start 532 /* offsetof(struct boot_params, hdr.code32_start) */ + +#define PTREGS_SIZE 168 /* sizeof(struct pt_regs) */ +#define TLB_STATE_user_pcid_flush_mask 22 /* offsetof(struct tlb_state, user_pcid_flush_mask) */ +#define CPU_ENTRY_AREA_tss 8192 /* offsetof(struct cpu_entry_area, tss) */ +#define CPU_ENTRY_AREA_entry_trampoline 20480 /* offsetof(struct cpu_entry_area, entry_trampoline) */ +#define CPU_ENTRY_AREA_entry_stack 4096 /* offsetof(struct cpu_entry_area, entry_stack_page) */ +#define SIZEOF_entry_stack 512 /* sizeof(struct entry_stack) */ + +#endif diff --git a/ops/os_stat/os_stat/include_tk2/include/linux/nospec.h b/ops/os_stat/os_stat/include_tk2/include/linux/nospec.h new file mode 100644 index 0000000000000000000000000000000000000000..0c5ef54fd4162830b55aa676c1ecae4ea6ac23f5 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/include/linux/nospec.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Linus Torvalds. All rights reserved. +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved. +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#ifndef _LINUX_NOSPEC_H +#define _LINUX_NOSPEC_H +#include + +struct task_struct; + +/** + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * When @index is out of bounds (@index >= @size), the sign bit will be + * set. Extend the sign bit to all bits and invert, giving a result of + * zero for an out of bounds index, or ~0 if within bounds [0, @size). + */ +#ifndef array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + /* + * Always calculate and emit the mask even if the compiler + * thinks the mask is not needed. The compiler does not take + * into account the value of @index under speculation. 
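The mask arithmetic is easier to trust with concrete numbers. The standalone sketch below reproduces the same expression, assuming a 64-bit long and omitting the OPTIMIZER_HIDE_VAR() step, to show that an in-bounds index produces an all-ones mask while an out-of-bounds index produces zero:

#include <assert.h>
#include <stdio.h>

#define BITS_PER_LONG 64   /* assumption: LP64 target */

/* Illustrative sketch only: same arithmetic as array_index_mask_nospec(),
 * relying (as the kernel does) on an arithmetic right shift of the sign bit. */
static unsigned long mask_nospec(unsigned long index, unsigned long size)
{
        return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
}

int main(void)
{
        assert(mask_nospec(2, 4) == ~0UL);  /* 2 < 4: index passes through */
        assert(mask_nospec(4, 4) == 0UL);   /* 4 >= 4: index clamps to 0   */
        assert(mask_nospec(9, 4) == 0UL);
        printf("in-bounds -> ~0, out-of-bounds -> 0\n");
        return 0;
}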
+ */ + OPTIMIZER_HIDE_VAR(index); + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); +} +#endif + +/* + * array_index_nospec - sanitize an array index after a bounds check + * + * For a code sequence like: + * + * if (index < size) { + * index = array_index_nospec(index, size); + * val = array[index]; + * } + * + * ...if the CPU speculates past the bounds check then + * array_index_nospec() will clamp the index within the range of [0, + * size). + */ +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ + \ + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ + \ + (typeof(_i)) (_i & _mask); \ +}) + +/* Speculation control prctl */ +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which); +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl); +/* Speculation control for seccomp enforced mitigation */ +void arch_seccomp_spec_mitigate(struct task_struct *task); + +#endif /* _LINUX_NOSPEC_H */ diff --git a/ops/os_stat/os_stat/include_tk2/kernel/sched/auto_group.h b/ops/os_stat/os_stat/include_tk2/kernel/sched/auto_group.h new file mode 100644 index 0000000000000000000000000000000000000000..8bd047142816dea81894bb27ccc3c78a38ac3d61 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/kernel/sched/auto_group.h @@ -0,0 +1,64 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include +#include + +struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/ops/os_stat/os_stat/include_tk2/kernel/sched/cpuacct.h b/ops/os_stat/os_stat/include_tk2/kernel/sched/cpuacct.h new file mode 100644 index 0000000000000000000000000000000000000000..2798bb1f8889b0bbd65ab85724844f47856621fc --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/kernel/sched/cpuacct.h @@ -0,0 +1,20 @@ +#ifdef CONFIG_CGROUP_CPUACCT + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void bt_cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct 
task_struct *p, int index, u64 val); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} +static inline void bt_cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + +#endif diff --git a/ops/os_stat/os_stat/include_tk2/kernel/sched/cpupri.h b/ops/os_stat/os_stat/include_tk2/kernel/sched/cpupri.h new file mode 100644 index 0000000000000000000000000000000000000000..f6d756173491c87e3defef7e16547cd24d75fbfa --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/kernel/sched/cpupri.h @@ -0,0 +1,34 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + atomic_t count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, struct cpumask *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/ops/os_stat/os_stat/include_tk2/kernel/sched/features.h b/ops/os_stat/os_stat/include_tk2/kernel/sched/features.h new file mode 100644 index 0000000000000000000000000000000000000000..e75e66c8bd3d9ad2463d224366731fecdf635871 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/kernel/sched/features.h @@ -0,0 +1,74 @@ +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ +SCHED_FEAT(START_DEBIT, true) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, false) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, true) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, true) + +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, true) + +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) +SCHED_FEAT(LB_BIAS, true) + +/* + * Decrement CPU power based on time not spent running tasks + */ +SCHED_FEAT(NONTASK_POWER, true) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, true) + +SCHED_FEAT(FORCE_SD_OVERLAP, false) +SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. 
+ */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, false) +SCHED_FEAT(NUMA_FORCE, false) +#endif + +SCHED_FEAT(BT_RUNTIME_SHARE, false) diff --git a/ops/os_stat/os_stat/include_tk2/kernel/sched/sched.h b/ops/os_stat/os_stat/include_tk2/kernel/sched/sched.h new file mode 100644 index 0000000000000000000000000000000000000000..7cfb2f69f24a9e8dcc8613f058ccd07837b5db7e --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/kernel/sched/sched.h @@ -0,0 +1,1617 @@ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpupri.h" +#include "cpuacct.h" + +extern __read_mostly int scheduler_running; + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MIN_BT_PRI + 1 ..MAX_BT_PRIO ], + * and back. + */ +#define NICE_TO_BT_PRIO(nice) (MAX_RT_PRIO + (nice) + 20 + 40) +#define PRIO_TO_BT_NICE(prio) ((prio) - MAX_RT_PRIO - 20 - 40) +#define TASK_BT_NICE(p) PRIO_TO_BT_NICE((p)->static_prio) + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. + */ +#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + +/* + * These are the 'tuning knobs' of the scheduler: + */ + +/* + * single value that denotes runtime == period, ie unlimited time. 
+ */ +#define RUNTIME_INF ((u64)~0ULL) + +static inline int rt_policy(int policy) +{ + if (policy == SCHED_FIFO || policy == SCHED_RR) + return 1; + return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +static inline int bt_policy(int policy) +{ + if (policy == SCHED_BT) + return 1; + return 0; +} + +static inline int task_has_bt_policy(struct task_struct *p) +{ + return bt_policy(p->policy); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +extern struct mutex sched_domains_mutex; + +#ifdef CONFIG_CGROUP_SCHED + +#include + +struct cfs_rq; +struct rt_rq; +struct bt_rq; + +extern struct list_head task_groups; + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchal_quota; + u64 runtime_expires; + u64 relax_thresh; + + int idle, timer_active; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; +#endif +}; + +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + + atomic64_t load_avg; + atomic_t runnable_avg; +#endif +#ifdef CONFIG_BT_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_bt_entity **bt; + /* runqueue "owned" by this group on each cpu */ + struct bt_rq **bt_rq; + unsigned long bt_shares; + + atomic64_t bt_load_avg; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + + struct cfs_bandwidth cfs_bandwidth; + + unsigned long offline; + struct mutex offline_mutex; +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif +#ifdef CONFIG_BT_GROUP_SCHED +#define ROOT_TASK_GROUP_BT_LOAD NICE_0_LOAD +#define MIN_BT_SHARES (1UL << 1) +#define MAX_BT_SHARES (1UL << 18) +#endif + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. 
+ */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); +extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + +extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +extern void free_bt_sched_group(struct task_group *tg); +extern int alloc_bt_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_bt_sched_group(struct task_group *tg); +extern int sched_group_set_bt_shares(struct task_group *tg, unsigned long shares); +extern void unregister_bt_sched_group(struct task_group *tg, int cpu); +extern void init_tg_bt_entry(struct task_group *tg, struct bt_rq *bt_rq, + struct sched_bt_entity *se, int cpu, + struct sched_bt_entity *parent); + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); + +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +#endif +#ifdef CONFIG_BT_GROUP_SCHED +extern int sched_group_set_bt_shares(struct task_group *tg, unsigned long shares); +#endif + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned int nr_running, h_nr_running; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last, *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). 
+ */ + u64 runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter, removed_load; + u64 last_decay; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + u32 tg_runnable_contrib; + u64 tg_load_contrib; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; + int throttled, throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +}; + +struct bt_bandwidth { + raw_spinlock_t bt_runtime_lock; + ktime_t bt_period; + u64 bt_runtime; + struct hrtimer bt_period_timer; + int timer_active; +}; + +static inline int bt_bandwidth_enabled(void) +{ + return sysctl_sched_bt_runtime >= 0; +} + + +/* Batch-related fields in a runqueue */ +struct bt_rq { + struct load_weight load; + unsigned int nr_running, h_nr_running; + unsigned long nr_uninterruptible; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + /* + * 'curr' points to currently running entity on this bt_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_bt_entity *curr, *next, *last, *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_BT_GROUP_SCHED + /* + * BT Load tracking + */ + struct sched_avg_bt avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; + + + unsigned long tg_load_avg_contrib; +#endif /* CONFIG_BT_GROUP_SCHED */ + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; +#endif /* CONFIG_SMP */ +#ifdef CONFIG_BT_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. 
+ */ + int on_list; + struct list_head leaf_bt_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ +#endif /* CONFIG_BT_GROUP_SCHED */ + + int bt_throttled; + u64 bt_time; + u64 bt_runtime; + + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; + + + /* Nests inside the rq lock: */ + raw_spinlock_t bt_runtime_lock; +}; + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; +#endif + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; +#endif +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + + /* Indicate more than one runnable task for any CPU */ + bool overload; + bool overload_bt; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; +}; + +extern struct root_domain def_root_domain; + +#endif /* CONFIG_SMP */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
+ */ + unsigned int nr_running; + unsigned int bt_nr_running; + u64 bt_blocked_clock; + + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; + + unsigned long cpu_bt_load[CPU_LOAD_IDX_MAX]; + unsigned long last_bt_load_update_tick; + unsigned long do_lb; + +#ifdef CONFIG_NO_HZ_COMMON + u64 nohz_stamp; + unsigned long nohz_flags; +#endif +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif + int skip_clock_update; + + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct load_weight bt_load; + unsigned long nr_bt_load_updates; + + struct cfs_rq cfs; + struct rt_rq rt; + struct bt_rq bt; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; +#ifdef CONFIG_SMP + unsigned long h_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_BT_GROUP_SCHED + struct list_head leaf_bt_rq_list; +#ifdef CONFIG_SMP + unsigned long h_bt_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_BT_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED + struct list_head leaf_rt_rq_list; +#endif + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle, *stop; + unsigned long next_balance; + unsigned long next_balance_bt; + struct mm_struct *prev_mm; + + u64 clock; + u64 clock_task; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + unsigned long cpu_power; + + unsigned char idle_balance; + /* For active balancing */ + int post_schedule; + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + + int active_balance_bt; + int push_cpu_bt; + struct cpu_stop_work active_bt_balance_work; + + /* cpu of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; + struct list_head bt_tasks; + + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; + u64 idle_bt_stamp; + u64 avg_idle_bt; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif +}; + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +//DECLARE_PER_CPU(struct rq, runqueues); + +#define cpu_rq(cpu) (per_cpu(runqueues, (cpu))) +#define this_rq() (__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +//#define raw_rq() (&__raw_get_cpu_var(runqueues)) +#define raw_rq() raw_cpu_ptr(runqueues) + +#ifdef CONFIG_SMP + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +//#define for_each_domain(cpu, __sd) \ +// for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ +// __sd; __sd = __sd->parent) + +//#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The cpu whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given cpu. + * + * Returns the highest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + return hsd; +} + +DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_id); + +struct sched_group_power { + atomic_t ref; + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. + */ + unsigned int power, power_orig, power_bt; + unsigned long next_update; + /* + * Number of busy cpus in this group. + */ + atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + int bt_balance_cpu; + unsigned int group_weight; + struct sched_group_power *sgp; + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + +extern int group_balance_cpu(struct sched_group *sg); + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "auto_group.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. 
+ * + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + return p->sched_task_group; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) || \ + defined(CONFIG_BT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; +#endif +#ifdef CONFIG_BT_GROUP_SCHED + p->bt.bt_rq = tg->bt_rq[cpu]; + p->bt.parent = tg->bt[cpu]; +#endif +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +extern const_debug unsigned int sysctl_sched_features; + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +static __always_inline bool static_branch__true(struct static_key *key) +{ + return static_key_true(key); /* Not out of line branch. */ +} + +static __always_inline bool static_branch__false(struct static_key *key) +{ + return static_key_false(key); /* Out of line branch. 
*/ +} + +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ +{ \ + return static_branch__##enabled(key); \ +} + +#include "features.h" + +#undef SCHED_FEAT + +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) +#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ + +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#ifdef CONFIG_SCHED_DEBUG +#define numabalancing_enabled sched_feat_numa(NUMA) +#else +extern bool numabalancing_enabled; +#endif /* CONFIG_SCHED_DEBUG */ +#else +#define sched_feat_numa(x) (0) +#define numabalancing_enabled (0) +#endif /* CONFIG_NUMA_BALANCING */ + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +static inline u64 global_bt_period(void) +{ + return (u64)sysctl_sched_bt_period * NSEC_PER_USEC; +} + +static inline u64 global_bt_runtime(void) +{ + if (sysctl_sched_bt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_bt_runtime * NSEC_PER_USEC; +} + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif + raw_spin_unlock(&rq->lock); +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. 
+ * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif + local_irq_enable(); +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x4 /* internal use, task got migrated */ + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 
+ * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif + +#define DEQUEUE_SLEEP 1 + +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + + struct task_struct * (*pick_next_task) (struct rq *rq); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq); + void (*task_waking) (struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); + void (*task_fork) (struct task_struct *p); + void (*task_dead) (struct task_struct *p); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); + +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_BT_GROUP_SCHED) + void (*task_move_group) (struct task_struct *p, int on_rq); +#endif +}; + +#define sched_class_highest (&stop_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class bt_sched_class; +extern const struct sched_class idle_sched_class; + + +#ifdef CONFIG_SMP + +extern void update_group_power(struct sched_domain *sd, int cpu); + +extern void trigger_load_balance(struct rq *rq, int cpu); +extern void idle_balance(int this_cpu, struct rq *this_rq); + +extern void idle_balance_bt(int this_cpu, struct rq *this_rq); + +/* + * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg + * becomes useful in lb + */ +#if defined(CONFIG_FAIR_GROUP_SCHED) +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq 
*this_rq); +#else +static inline void idle_enter_fair(struct rq *this_rq) {} +static inline void idle_exit_fair(struct rq *this_rq) {} +#endif +#if defined(CONFIG_BT_GROUP_SCHED) +extern void idle_enter_bt(struct rq *this_rq); +extern void idle_exit_bt(struct rq *this_rq); +#else +static inline void idle_enter_bt(struct rq *this_rq) {} +static inline void idle_exit_bt(struct rq *this_rq) {} +#endif + +#else /* CONFIG_SMP */ + +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + +#endif + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); +extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); +extern void init_sched_bt_class(void); + +extern void resched_task(struct task_struct *p); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +extern void update_idle_cpu_load(struct rq *this_rq); + +extern int update_bt_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); + +extern struct bt_bandwidth def_bt_bandwidth; +extern void init_bt_bandwidth(struct bt_bandwidth *bt_b, u64 period, u64 runtime); +extern void update_idle_cpu_bt_load(struct rq *this_rq); +extern void init_bt_entity_runnable_average(struct sched_bt_entity *se); +extern void post_init_bt_entity_util_avg(struct sched_bt_entity *se); +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + +static inline void inc_nr_running(struct rq *rq) +{ + rq->nr_running++; + +#ifdef CONFIG_SMP + if (!rq->rd->overload && (rq->nr_running - rq->bt_nr_running == 2)) + rq->rd->overload = true; +#endif + + if (rq->nr_running == 2) { +#ifdef CONFIG_SMP + if (rq->bt_nr_running && !rq->rd->overload_bt) + rq->rd->overload_bt = true; +#endif +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(rq->cpu)) { + /* Order rq->nr_running write against the IPI */ + smp_wmb(); + smp_send_reschedule(rq->cpu); + } +#endif + } +} + +static inline void dec_nr_running(struct rq *rq) +{ + rq->nr_running--; +} + +static inline void rq_last_tick_reset(struct rq *rq) +{ +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = jiffies; +#endif +} + +extern void update_rq_clock(struct rq *rq); + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_time_avg; +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +static inline u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifdef CONFIG_SMP 
+extern void sched_avg_update(struct rq *rq); +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} +#else +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } +static inline void sched_avg_update(struct rq *rq) { } +#endif + +extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); + +#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!raw_spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); + ret = 1; + } else + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); + } + return ret; +} + +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + raw_spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + raw_spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + +#endif + +extern struct sched_bt_entity *__pick_first_bt_entity(struct bt_rq *bt_rq); +extern struct sched_bt_entity *__pick_last_bt_entity(struct bt_rq *bt_rq); +extern void print_bt_stats(struct seq_file *m, int cpu); + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_bt_rq(struct bt_rq *bt_rq); +extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); + +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); + +#ifdef CONFIG_NO_HZ_COMMON +enum rq_nohz_flag_bits { + NOHZ_TICK_STOPPED, + NOHZ_BALANCE_KICK, +}; + +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +DECLARE_PER_CPU(u64, cpu_hardirq_time); +DECLARE_PER_CPU(u64, cpu_softirq_time); + +#ifndef CONFIG_64BIT +DECLARE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +} diff --git a/ops/os_stat/os_stat/include_tk2/kernel/sched/stats.h b/ops/os_stat/os_stat/include_tk2/kernel/sched/stats.h new file mode 100644 index 0000000000000000000000000000000000000000..2ef90a51ec5e3ad6ccff9e8f44fde752d204dcda --- /dev/null +++ 
b/ops/os_stat/os_stat/include_tk2/kernel/sched/stats.h @@ -0,0 +1,231 @@ + +#ifdef CONFIG_SCHEDSTATS + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcount++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_cpu_time += delta; +} + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +# define schedstat_set(var, val) do { var = (val); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{} +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +#endif + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * We are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a cpu, we call this routine + * from dequeue_task() to account for possible rq->clock skew across cpus. The + * delta taken on each cpu would annul the skew. + */ +static inline void sched_info_dequeued(struct task_struct *t) +{ + unsigned long long now = task_rq(t)->clock, delta = 0; + + if (unlikely(sched_info_on())) + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(task_rq(t), delta); +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct task_struct *t) +{ + unsigned long long now = task_rq(t)->clock, delta = 0; + + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcount++; + + rq_sched_info_arrive(task_rq(t), delta); +} + +/* + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct task_struct *t) +{ + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = task_rq(t)->clock; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. 
+ */ +static inline void sched_info_depart(struct task_struct *t) +{ + unsigned long long delta = task_rq(t)->clock - + t->sched_info.last_arrival; + + rq_sched_info_depart(task_rq(t), delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(t); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + struct rq *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. + */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_reset_dequeued(t) do { } while (0) +#define sched_info_dequeued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; + + raw_spin_lock(&cputimer->lock); + cputimer->cputime.utime += cputime; + raw_spin_unlock(&cputimer->lock); +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; + + raw_spin_lock(&cputimer->lock); + cputimer->cputime.stime += cputime; + raw_spin_unlock(&cputimer->lock); +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk: Pointer to task structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. 
+ */ +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; + + raw_spin_lock(&cputimer->lock); + cputimer->cputime.sum_exec_runtime += ns; + raw_spin_unlock(&cputimer->lock); +} diff --git a/ops/os_stat/os_stat/include_tk2/mm/slab.h b/ops/os_stat/os_stat/include_tk2/mm/slab.h new file mode 100644 index 0000000000000000000000000000000000000000..4d6d836247dd95c3eb5e4d3e39295a80eb000a81 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk2/mm/slab.h @@ -0,0 +1,275 @@ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. + */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size); + +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void create_kmalloc_caches(unsigned long); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); +#endif + + +/* Functions provided by the slab allocators */ +extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); + +extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, + unsigned long flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + size_t size, unsigned long flags); + +struct mem_cgroup; +#ifdef CONFIG_SLUB +struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)); +#else +static inline struct kmem_cache * +__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) +{ return NULL; } +#endif + + +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_NOTRACK) +#else 
+#define SLAB_CACHE_FLAGS (0) +#endif + +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +int __kmem_cache_shutdown(struct kmem_cache *); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params || s->memcg_params->is_root_cache; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return (is_root_cache(cachep) && !memcg) || + (cachep->memcg_params->memcg == memcg); +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ + if (!is_root_cache(s)) + atomic_add(1 << order, &s->memcg_params->nr_pages); +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ + if (is_root_cache(s)) + return; + + if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) + mem_cgroup_destroy_cache(s); +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return (p == s) || + (s->memcg_params && (p == s->memcg_params->root_cache)); +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + return s->memcg_params->root_cache->name; + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + if (!s->memcg_params) + return NULL; + return s->memcg_params->memcg_caches[idx]; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params->root_cache; +} +#else +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool cache_match_memcg(struct kmem_cache *cachep, + struct mem_cgroup *memcg) +{ + return true; +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +{ + return NULL; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} +#endif + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. 
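Editorial note (illustrative, not part of this patch): the struct slabinfo fields above are what /proc/slabinfo reports per cache (active objects, total objects, object size, and so on). A minimal reader; the file is normally readable only by root.

```c
/*
 * Editorial sketch (not part of the patch): print name, active/total object
 * counts and object size for the first few caches listed in /proc/slabinfo.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char line[512], name[64];
    unsigned long active_objs, num_objs, objsize;
    int rows = 0;
    FILE *f = fopen("/proc/slabinfo", "r");

    if (!f) {
        perror("fopen /proc/slabinfo (try as root)");
        return 1;
    }
    while (fgets(line, sizeof(line), f) && rows < 5) {
        if (line[0] == '#' || strncmp(line, "slabinfo", 8) == 0)
            continue;  /* skip the version and column-header lines */
        if (sscanf(line, "%63s %lu %lu %lu",
                   name, &active_objs, &num_objs, &objsize) == 4) {
            printf("%-24s active=%lu total=%lu objsize=%lu\n",
                   name, active_objs, num_objs, objsize);
            rows++;
        }
    }
    fclose(f);
    return 0;
}
```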
+ */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __FUNCTION__, cachep->name, s->name); + WARN_ON_ONCE(1); + return s; +} +#endif + + +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; diff --git a/ops/os_stat/os_stat/include_tk3/arch/x86/include/asm/syscall.h b/ops/os_stat/os_stat/include_tk3/arch/x86/include/asm/syscall.h new file mode 100644 index 0000000000000000000000000000000000000000..03eedc21246d5b65d39ffb45ce27c588151d7488 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/arch/x86/include/asm/syscall.h @@ -0,0 +1,242 @@ +/* + * Access to user system call parameters and results + * + * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * See asm-generic/syscall.h for descriptions of what we must do here. + */ + +#ifndef _ASM_X86_SYSCALL_H +#define _ASM_X86_SYSCALL_H + +#include +#include +#include +#include /* For NR_syscalls */ +#include /* for TS_COMPAT */ +#include + +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); +extern const sys_call_ptr_t sys_call_table[]; + +#if defined(CONFIG_X86_32) +#define ia32_sys_call_table sys_call_table +#define __NR_syscall_compat_max __NR_syscall_max +#define IA32_NR_syscalls NR_syscalls +#endif + +#if defined(CONFIG_IA32_EMULATION) +extern const sys_call_ptr_t ia32_sys_call_table[]; +#endif + +/* + * Only the low 32 bits of orig_ax are meaningful, so we return int. + * This importantly ignores the high bits on 64-bit, so comparisons + * sign-extend the low 32 bits. + */ +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) +{ + return regs->orig_ax; +} + +static inline void syscall_rollback(struct task_struct *task, + struct pt_regs *regs) +{ + regs->ax = regs->orig_ax; +} + +static inline long syscall_get_error(struct task_struct *task, + struct pt_regs *regs) +{ + unsigned long error = regs->ax; +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entries and then + * remains set until we return to user mode. + */ + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. + */ + error = (long) (int) error; +#endif + return IS_ERR_VALUE(error) ? 
error : 0; +} + +static inline long syscall_get_return_value(struct task_struct *task, + struct pt_regs *regs) +{ + return regs->ax; +} + +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->ax = (long) error ?: val; +} + +#ifdef CONFIG_X86_32 + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + unsigned long *args) +{ + BUG_ON(i + n > 6); + memcpy(args, ®s->bx + i, n * sizeof(args[0])); +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ + BUG_ON(i + n > 6); + memcpy(®s->bx + i, args, n * sizeof(args[0])); +} + +static inline int syscall_get_arch(void) +{ + return AUDIT_ARCH_I386; +} + +#else /* CONFIG_X86_64 */ + +static inline void syscall_get_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; + *args++ = regs->bx; + case 1: + if (!n--) break; + *args++ = regs->cx; + case 2: + if (!n--) break; + *args++ = regs->dx; + case 3: + if (!n--) break; + *args++ = regs->si; + case 4: + if (!n--) break; + *args++ = regs->di; + case 5: + if (!n--) break; + *args++ = regs->bp; + case 6: + if (!n--) break; + default: + BUG(); + break; + } + else +# endif + switch (i) { + case 0: + if (!n--) break; + *args++ = regs->di; + case 1: + if (!n--) break; + *args++ = regs->si; + case 2: + if (!n--) break; + *args++ = regs->dx; + case 3: + if (!n--) break; + *args++ = regs->r10; + case 4: + if (!n--) break; + *args++ = regs->r8; + case 5: + if (!n--) break; + *args++ = regs->r9; + case 6: + if (!n--) break; + default: + BUG(); + break; + } +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned int i, unsigned int n, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; + regs->bx = *args++; + case 1: + if (!n--) break; + regs->cx = *args++; + case 2: + if (!n--) break; + regs->dx = *args++; + case 3: + if (!n--) break; + regs->si = *args++; + case 4: + if (!n--) break; + regs->di = *args++; + case 5: + if (!n--) break; + regs->bp = *args++; + case 6: + if (!n--) break; + default: + BUG(); + break; + } + else +# endif + switch (i) { + case 0: + if (!n--) break; + regs->di = *args++; + case 1: + if (!n--) break; + regs->si = *args++; + case 2: + if (!n--) break; + regs->dx = *args++; + case 3: + if (!n--) break; + regs->r10 = *args++; + case 4: + if (!n--) break; + regs->r8 = *args++; + case 5: + if (!n--) break; + regs->r9 = *args++; + case 6: + if (!n--) break; + default: + BUG(); + break; + } +} + +static inline int syscall_get_arch(void) +{ + /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ + return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; +} +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SYSCALL_H */ diff --git a/ops/os_stat/os_stat/include_tk3/drivers/block/loop.h b/ops/os_stat/os_stat/include_tk3/drivers/block/loop.h new file mode 100644 index 0000000000000000000000000000000000000000..b2251752452bc75ebc02049b1395c1a072981a77 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/drivers/block/loop.h @@ -0,0 +1,94 @@ +/* + * loop.h + * + * Written by Theodore Ts'o, 3/29/93. 
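Editorial note (illustrative, not part of this patch): the switch statements in syscall_get_arguments()/syscall_set_arguments() above encode the x86 argument-register conventions — rdi, rsi, rdx, r10, r8, r9 for 64-bit entries and bx, cx, dx, si, di, bp for 32-bit/compat entries. The same mapping can be observed from user space with ptrace at a syscall stop; the sketch below is x86-64 only and deliberately skips most error handling.

```c
/*
 * Editorial sketch (not part of the patch): stop a traced child at a write()
 * syscall and dump the argument registers in the x86-64 syscall order.
 */
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <sys/syscall.h>

int main(void)
{
    pid_t child = fork();

    if (child == 0) {
        /* Child: ask to be traced, then make an easily recognized syscall. */
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        raise(SIGSTOP);
        write(1, "hi\n", 3);
        _exit(0);
    }

    int status;
    struct user_regs_struct regs;

    waitpid(child, &status, 0);          /* initial SIGSTOP */
    for (;;) {
        ptrace(PTRACE_SYSCALL, child, NULL, NULL);
        waitpid(child, &status, 0);
        if (WIFEXITED(status))
            return 0;
        ptrace(PTRACE_GETREGS, child, NULL, &regs);
        if (regs.orig_rax == SYS_write)  /* stop of interest reached */
            break;
    }

    printf("nr=%llu args: %llu %llu %llu %llu %llu %llu\n",
           regs.orig_rax, regs.rdi, regs.rsi, regs.rdx,
           regs.r10, regs.r8, regs.r9);

    ptrace(PTRACE_CONT, child, NULL, NULL);
    waitpid(child, &status, 0);
    return 0;
}
```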
+ * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. + */ +#ifndef _LINUX_LOOP_H +#define _LINUX_LOOP_H + +#include +#include +#include +#include +#include +#include +#include + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, +}; + +struct loop_func_table; + +struct loop_device { + int lo_number; + atomic_t lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + int (*transfer)(struct loop_device *, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + char lo_file_name[LO_NAME_SIZE]; + char lo_crypt_name[LO_NAME_SIZE]; + char lo_encrypt_key[LO_KEY_SIZE]; + int lo_encrypt_key_size; + struct loop_func_table *lo_encryption; + __u32 lo_init[2]; + kuid_t lo_key_owner; /* Who set the key */ + int (*ioctl)(struct loop_device *, int cmd, + unsigned long arg); + + struct file * lo_backing_file; + struct block_device *lo_device; + void *key_data; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + struct kthread_worker worker; + struct task_struct *worker_task; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; +}; + +struct loop_cmd { + struct kthread_work work; + struct request *rq; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; +}; + +/* Support for loadable transfer modules */ +struct loop_func_table { + int number; /* filter type */ + int (*transfer)(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + int (*init)(struct loop_device *, const struct loop_info64 *); + /* release is called from loop_unregister_transfer or clr_fd */ + int (*release)(struct loop_device *); + int (*ioctl)(struct loop_device *, int cmd, unsigned long arg); + struct module *owner; +}; + +int loop_register_transfer(struct loop_func_table *funcs); +int loop_unregister_transfer(int number); + +#endif diff --git a/ops/os_stat/os_stat/include_tk3/drivers/target/target_core_file.h b/ops/os_stat/os_stat/include_tk3/drivers/target/target_core_file.h new file mode 100644 index 0000000000000000000000000000000000000000..53be5ffd32613392b16e736e45f1e63380d16e4c --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/drivers/target/target_core_file.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef TARGET_CORE_FILE_H +#define TARGET_CORE_FILE_H + +#include + +#define FD_VERSION "4.0" + +#define FD_MAX_DEV_NAME 256 +#define FD_MAX_DEV_PROT_NAME FD_MAX_DEV_NAME + 16 +#define FD_DEVICE_QUEUE_DEPTH 32 +#define FD_MAX_DEVICE_QUEUE_DEPTH 128 +#define FD_BLOCKSIZE 512 +/* + * Limited by the number of iovecs (2048) per vfs_[writev,readv] call + */ +#define FD_MAX_BYTES 8388608 + +#define RRF_EMULATE_CDB 0x01 +#define RRF_GOT_LBA 0x02 + +#define FBDF_HAS_PATH 0x01 +#define FBDF_HAS_SIZE 0x02 +#define FDBD_HAS_BUFFERED_IO_WCE 0x04 +#define FDBD_FORMAT_UNIT_SIZE 2048 + +struct fd_dev { + struct se_device dev; + + u32 fbd_flags; + unsigned char fd_dev_name[FD_MAX_DEV_NAME]; + /* Unique Ramdisk Device ID in Ramdisk HBA */ + u32 fd_dev_id; + /* Number of SG tables in sg_table_array */ + u32 fd_table_count; + u32 fd_queue_depth; + u32 fd_block_size; + unsigned long long fd_dev_size; + struct file *fd_file; + struct file *fd_prot_file; + /* 
FILEIO HBA device is connected to */ + struct fd_host *fd_host; +} ____cacheline_aligned; + +struct fd_host { + u32 fd_host_dev_id_count; + /* Unique FILEIO Host ID */ + u32 fd_host_id; +} ____cacheline_aligned; + +#endif /* TARGET_CORE_FILE_H */ diff --git a/ops/os_stat/os_stat/include_tk3/fs/ext4_new/ext4.h b/ops/os_stat/os_stat/include_tk3/fs/ext4_new/ext4.h new file mode 100644 index 0000000000000000000000000000000000000000..8d2f8f2d742559a8120c742f528cb5a605ccba42 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/fs/ext4_new/ext4.h @@ -0,0 +1,3280 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#include +#else +#include +#endif +#include +#include +#ifdef __KERNEL__ +#include +#endif + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. 
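Editorial note (illustrative, not part of this patch): struct ext4_map_blocks carries the logical-to-physical translation that ext4_map_blocks() performs. From user space the resulting extent layout of a file can be inspected with the generic FIEMAP ioctl; a minimal sketch, assuming &lt;linux/fiemap.h&gt; is available:

```c
/*
 * Editorial sketch (not part of the patch): print the first few extents of a
 * file (logical offset, physical offset, length) via FS_IOC_FIEMAP.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

#define MAX_EXTENTS 8

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    size_t sz = sizeof(struct fiemap) + MAX_EXTENTS * sizeof(struct fiemap_extent);
    struct fiemap *fm = calloc(1, sz);
    if (!fm)
        return 1;
    fm->fm_start = 0;
    fm->fm_length = ~0ULL;            /* map the whole file */
    fm->fm_flags = FIEMAP_FLAG_SYNC;  /* flush delalloc so extents are mapped */
    fm->fm_extent_count = MAX_EXTENTS;

    if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
        perror("FS_IOC_FIEMAP");
        return 1;
    }
    for (unsigned i = 0; i < fm->fm_mapped_extents; i++)
        printf("logical=%llu physical=%llu length=%llu flags=0x%x\n",
               (unsigned long long)fm->fm_extents[i].fe_logical,
               (unsigned long long)fm->fm_extents[i].fe_physical,
               (unsigned long long)fm->fm_extents[i].fe_length,
               fm->fm_extents[i].fe_flags);

    free(fm);
    close(fd);
    return 0;
}
```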
+ */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + atomic_t count; /* reference counter */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +} ext4_io_end_t; + +struct ext4_io_submit { + struct writeback_control *io_wbc; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) 
(EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. 
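Editorial note (illustrative, not part of this patch): the EXT4_*_FL values above are kept consistent with the generic FS_*_FL constants from include/linux/fs.h (the consistency comment and CHECK_FLAG_VALUE checks further down enforce the matching bit positions), so the same bits are visible through the standard FS_IOC_GETFLAGS ioctl, which EXT4_IOC_GETFLAGS aliases later in this header. A minimal caller:

```c
/*
 * Editorial sketch (not part of the patch): read a file's inode flags with
 * FS_IOC_GETFLAGS and report a few well-known bits.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    int flags = 0;
    if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
        perror("FS_IOC_GETFLAGS");
        return 1;
    }
    printf("flags=0x%x%s%s%s\n", flags,
           (flags & FS_IMMUTABLE_FL) ? " immutable"   : "",
           (flags & FS_APPEND_FL)    ? " append-only" : "",
           (flags & FS_NOATIME_FL)   ? " noatime"     : "");
    close(fd);
    return 0;
}
```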
*/ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will 
not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. */ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT +#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY + +#ifndef FS_IOC_FSGETXATTR +/* Until the uapi changes get merged for project quota... */ + +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) + +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. 
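Editorial note (illustrative, not part of this patch): the struct fsxattr fallback defined just below is the interface ext4 uses for project-quota attributes via EXT4_IOC_FSGETXATTR/FSSETXATTR. On systems whose &lt;linux/fs.h&gt; already provides FS_IOC_FSGETXATTR and struct fsxattr, a user-space caller looks like this sketch:

```c
/*
 * Editorial sketch (not part of the patch): query a file's fsxattr data
 * (xflags, extent size hint, project id) with FS_IOC_FSGETXATTR. Assumes a
 * <linux/fs.h> recent enough to define the structure and ioctl number.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    struct fsxattr fx;
    if (ioctl(fd, FS_IOC_FSGETXATTR, &fx) < 0) {
        perror("FS_IOC_FSGETXATTR");
        return 1;
    }
    printf("xflags=0x%x extsize=%u projid=%u\n",
           fx.fsx_xflags, fx.fsx_extsize, fx.fsx_projid);
    close(fd);
    return 0;
}
```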
+ */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ +#endif /* !defined(FS_IOC_FSGETXATTR) */ + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + +#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 
l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. + */ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + u32 extra = sizeof(time->tv_sec) > 4 ? 
+ ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0; + return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (unlikely(sizeof(time->tv_sec) > 4 && + (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { + +#if 1 + /* Handle legacy encoding of pre-1970 dates with epoch + * bits 1,1. (This backwards compatibility may be removed + * at the discretion of the ext4 developers.) + */ + u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; + if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) + extra_bits = 0; + time->tv_sec += extra_bits << 32; +#else + time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; +#endif + } + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. 
Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) + * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. 
Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + atomic_t i_prealloc_active; + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + + kprojid_t i_projid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ + +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. 
+ * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + 
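/*
 * A minimal, self-contained user-space sketch of how the split _lo/_hi
 * superblock counters above (e.g. s_blocks_count_lo and s_blocks_count_hi)
 * combine into one 64-bit value, mirroring the ext4_blocks_count() and
 * ext4_blocks_count_set() helpers declared later in this header. The demo_*
 * names are hypothetical stand-ins, not ext4 code, and the real fields are
 * little-endian (__le32) and must go through le32_to_cpu()/cpu_to_le32().
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct demo_counts {
	uint32_t s_blocks_count_lo;	/* low 32 bits of the block count */
	uint32_t s_blocks_count_hi;	/* high 32 bits, used on 64bit filesystems */
};

static uint64_t demo_blocks_count(const struct demo_counts *es)
{
	/* Mirrors ext4_blocks_count(): hi shifted up, OR'ed with lo. */
	return ((uint64_t)es->s_blocks_count_hi << 32) | es->s_blocks_count_lo;
}

static void demo_blocks_count_set(struct demo_counts *es, uint64_t blk)
{
	/* Mirrors ext4_blocks_count_set(): truncate for lo, shift down for hi. */
	es->s_blocks_count_lo = (uint32_t)blk;
	es->s_blocks_count_hi = (uint32_t)(blk >> 32);
}

int main(void)
{
	struct demo_counts es;

	demo_blocks_count_set(&es, (5ULL << 32) | 42);	/* a count above 2^32 */
	printf("blocks: %" PRIu64 "\n", demo_blocks_count(&es));
	return 0;
}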
__le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __le32 s_reserved[98]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 3 + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not 
*/ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_ext4_flags; /* Ext4 superblock flags */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List of blocks to be freed + after commit completed */ + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_mb_max_inode_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. 
*/ + atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + + /* Barrier between changing inodes' journal flags and writepages ops. */ + struct percpu_rw_semaphore s_journal_flag_rwsem; + struct dax_device *s_daxdev; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. 
*/ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +/* + * Returns true if the inode is inode is encrypted + */ +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 + +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) 
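/*
 * A minimal user-space sketch of the EXT4_FEATURE_*_FUNCS() pattern defined
 * just above: one macro instantiation stamps out a has/set/clear helper
 * triple for a single feature bit, so call sites can write
 * ext4_has_feature_quota(sb) instead of open-coding the bit test (the real
 * quota instantiation follows right after this sketch). The demo_* names are
 * hypothetical stand-ins; the le32 conversions of the real helpers are
 * omitted, and the 0x0100 value is the RO_COMPAT_QUOTA bit defined earlier.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_super {
	uint32_t s_feature_ro_compat;	/* stand-in for the on-disk __le32 word */
};

#define DEMO_FEATURE_RO_COMPAT_QUOTA	0x0100

#define DEMO_FEATURE_RO_COMPAT_FUNCS(name, flagname)			\
static int demo_has_feature_##name(const struct demo_super *es)	\
{									\
	return (es->s_feature_ro_compat &				\
		DEMO_FEATURE_RO_COMPAT_##flagname) != 0;		\
}									\
static void demo_set_feature_##name(struct demo_super *es)		\
{									\
	es->s_feature_ro_compat |= DEMO_FEATURE_RO_COMPAT_##flagname;	\
}									\
static void demo_clear_feature_##name(struct demo_super *es)		\
{									\
	es->s_feature_ro_compat &= ~DEMO_FEATURE_RO_COMPAT_##flagname;	\
}

/* One instantiation generates all three helpers for the "quota" feature. */
DEMO_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA)

int main(void)
{
	struct demo_super es = { 0 };

	demo_set_feature_quota(&es);
	printf("quota: %d\n", demo_has_feature_quota(&es));	/* prints 1 */
	demo_clear_feature_quota(&es);
	printf("quota: %d\n", demo_has_feature_quota(&es));	/* prints 0 */
	return 0;
}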
+EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT) + +#define EXTN_FEATURE_FUNCS(ver) \ +static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool 
ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + +/* + * Superblock flags + */ +#define EXT4_FLAGS_RESIZING 0 +#define EXT4_FLAGS_SHUTDOWN 1 + +static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); +} + + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + +struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct fscrypt_str crypto_buf; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + 
return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. 
+ */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t 
ext4_inode_to_goal_block(struct inode *); + +static inline bool ext4_encrypted_inode(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); +} + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + memset(fname, 0, sizeof(struct ext4_filename)); + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + + fname->usr_fname = name.usr_fname; + fname->disk_name = name.disk_name; + fname->hinfo.hash = name.hash; + fname->hinfo.minor_hash = name.minor_hash; + fname->crypto_buf = name.crypto_buf; + return err; +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; +} +#else +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + return 0; +} +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } + +#endif + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!ext4_has_feature_dir_index(inode->i_sb)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} +static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + 
__ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + 0, (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern const struct file_operations ext4_seq_mb_groups_fops; +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); + +/* inode.c */ +int ext4_inode_is_fast_symlink(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head **bhs); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern 
void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern int ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern int ext4_page_mkwrite(struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); +extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, + struct extent_status *result); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_dirent_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern bool ext4_empty_dir(struct inode *inode); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); +extern int ext4_calculate_overhead(struct super_block *sb); +extern void 
ext4_superblock_csum_set(struct super_block *sb); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); + +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern void ext4_update_dynamic_rev(struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && + !EXT4_SB(sb)->s_chksum_driver); + + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return 
((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) +{ + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. 
Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct inode *dir, struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline, __u64 start, __u64 len); +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* 
namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); +#define S_SHIFT 12 +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (ext4_has_feature_filetype(sb)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages); + +/* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* sysfs.c */ +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
+ */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path **, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* + * Add new method to test 
whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; + +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + +extern const struct iomap_ops ext4_iomap_ops; + +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _EXT4_H */ diff --git a/ops/os_stat/os_stat/include_tk3/fs/ext4_new/extents_status.h b/ops/os_stat/os_stat/include_tk3/fs/ext4_new/extents_status.h new file mode 100644 index 0000000000000000000000000000000000000000..22a1883491572ffb905becb3b4083c743e847a0f --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/fs/ext4_new/extents_status.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. 
+ */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t 
block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); + +extern unsigned int ext4_shrink_es_timeout; +extern unsigned int ext4_shrink_es_timeout_min; +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/ops/os_stat/os_stat/include_tk3/fs/xfs/xfs_log_priv.h b/ops/os_stat/os_stat/include_tk3/fs/xfs/xfs_log_priv.h new file mode 100644 index 0000000000000000000000000000000000000000..51bf7b827387198d3bbc9f6db146abfcd3056a0c --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/fs/xfs/xfs_log_priv.h @@ -0,0 +1,619 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_PRIV_H__ +#define __XFS_LOG_PRIV_H__ + +struct xfs_buf; +struct xlog; +struct xlog_ticket; +struct xfs_mount; +struct xfs_log_callback; + +/* + * Flags for log structure + */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. + * + * this has endian issues, of course. 
+ */ +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ + +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). 
+ * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. + * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + +/* Ticket reservation region accounting */ +#define XLOG_TIC_LEN_MAX 15 + +/* + * Reservation region + * As would be stored in xfs_log_iovec but without the i_addr which + * we don't care about. + */ +typedef struct xlog_res { + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ +} xlog_res_t; + +typedef struct xlog_ticket { + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + char t_ocnt; /* original count : 1 */ + char t_cnt; /* current count : 1 */ + char t_clientid; /* who does this belong to; : 1 */ + char t_flags; /* properties of reservation : 1 */ + + /* reservation array fields */ + uint t_res_num; /* num in array : 4 */ + uint t_res_num_ophdrs; /* num op hdrs : 4 */ + uint t_res_arr_sum; /* array sum : 4 */ + uint t_res_o_flow; /* sum overflow : 4 */ + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ +} xlog_ticket_t; + +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. + * - ic_next is the pointer to the next iclog in the ring. + * - ic_bp is a pointer to the buffer used to write this incore log to disk. + * - ic_log is a pointer back to the global log structure. + * - ic_callback is a linked list of callback function/argument pairs to be + * called after an iclog finishes writing. + * - ic_size is the full size of the header plus data. + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. 
+ * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. + */ +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xfs_buf *ic_bp; + struct xlog *ic_log; + int ic_size; + int ic_offset; + int ic_bwritecnt; + unsigned short ic_state; + char *ic_datap; /* pointer to iclog data */ + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + struct xfs_log_callback *ic_callback; + struct xfs_log_callback **ic_callback_tail; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header +} xlog_in_core_t; + +/* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct xfs_log_callback log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ + struct work_struct discard_endio_work; +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct xlog *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; + struct xfs_cil_ctx *xc_ctx; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; +} ____cacheline_aligned_in_smp; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. 
We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) + +/* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. + */ +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number. Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. 
+ */ +struct xlog { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ + struct xfs_buf *l_xbuf; /* extra buffer for log + * wrapping */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; + + struct xfs_kobj l_kobj; + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + void *l_iclog_bak[XLOG_MAX_ICLOGS]; + /* log record crc error injection factor */ + uint32_t l_badcrc_factor; +#endif + /* log recovery lsn tracking (for buffer submission */ + xfs_lsn_t l_recovery_lsn; +}; + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) + +/* common routines */ +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); +extern int +xlog_recover_cancel(struct xlog *); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +void xlog_print_trans(struct xfs_trans *); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that 
the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); + +/* + * CIL force routines + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct xlog *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} + +/* + * The LSN is valid so long as it is behind the current LSN. If it isn't, this + * means that the next log record that includes this metadata could have a + * smaller LSN. In turn, this means that the modification in the log would not + * replay. + */ +static inline bool +xlog_valid_lsn( + struct xlog *log, + xfs_lsn_t lsn) +{ + int cur_cycle; + int cur_block; + bool valid = true; + + /* + * First, sample the current lsn without locking to avoid added + * contention from metadata I/O. The current cycle and block are updated + * (in xlog_state_switch_iclogs()) and read here in a particular order + * to avoid false negatives (e.g., thinking the metadata LSN is valid + * when it is not). + * + * The current block is always rewound before the cycle is bumped in + * xlog_state_switch_iclogs() to ensure the current LSN is never seen in + * a transiently forward state. 
Instead, we can see the LSN in a + * transiently behind state if we happen to race with a cycle wrap. + */ + cur_cycle = ACCESS_ONCE(log->l_curr_cycle); + smp_rmb(); + cur_block = ACCESS_ONCE(log->l_curr_block); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { + /* + * If the metadata LSN appears invalid, it's possible the check + * above raced with a wrap to the next log cycle. Grab the lock + * to check for sure. + */ + spin_lock(&log->l_icloglock); + cur_cycle = log->l_curr_cycle; + cur_block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) + valid = false; + } + + return valid; +} + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/ops/os_stat/os_stat/include_tk3/fs/xfs/xfs_trans_priv.h b/ops/os_stat/os_stat/include_tk3/fs/xfs/xfs_trans_priv.h new file mode 100644 index 0000000000000000000000000000000000000000..b317a3644c006817d7c6a2ccc3b0e08cc59e2a3f --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/fs/xfs/xfs_trans_priv.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_PRIV_H__ +#define __XFS_TRANS_PRIV_H__ + +struct xfs_log_item; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; + + +void xfs_trans_init(struct xfs_mount *); +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + bool abort); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); + +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); +/* + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. + */ +struct xfs_ail_cursor { + struct list_head list; + struct xfs_log_item *item; +}; + +/* + * Private AIL structures. 
+ * + * Eventually we need to drive the locking in here as well. + */ +struct xfs_ail { + struct xfs_mount *xa_mount; + struct task_struct *xa_task; + struct list_head xa_ail; + xfs_lsn_t xa_target; + xfs_lsn_t xa_target_prev; + struct list_head xa_cursors; + spinlock_t xa_lock; + xfs_lsn_t xa_last_pushed_lsn; + int xa_log_flush; + struct list_head xa_buf_list; + wait_queue_head_t xa_empty; +}; + +/* + * From xfs_trans_ail.c + */ +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static inline struct xfs_log_item * +xfs_ail_min( + struct xfs_ail *ailp) +{ + return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item, + li_ail); +} + +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock); + +static inline void +xfs_trans_ail_remove( + struct xfs_log_item *lip, + int shutdown_type) +{ + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock */ + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip, shutdown_type); + else + spin_unlock(&ailp->xa_lock); +} + +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +void xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); + +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); + +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->xa_lock); + *dst = *src; + spin_unlock(&ailp->xa_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (lip->li_flags & XFS_LI_FAILED) { + lip->li_flags &= ~XFS_LI_FAILED; + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (!(lip->li_flags & XFS_LI_FAILED)) { + xfs_buf_hold(bp); + lip->li_flags |= XFS_LI_FAILED; + lip->li_buf = bp; + } +} + +#endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/ops/os_stat/os_stat/include_tk3/include/generated/asm-offsets.h b/ops/os_stat/os_stat/include_tk3/include/generated/asm-offsets.h new file mode 100644 index 
0000000000000000000000000000000000000000..42dd77f36ce8b43ca2cf478af6dc2ddb1f5348f2 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/include/generated/asm-offsets.h @@ -0,0 +1,98 @@ +#ifndef __ASM_OFFSETS_H__ +#define __ASM_OFFSETS_H__ +/* + * DO NOT MODIFY. + * + * This file was generated by Kbuild + */ + +#define PV_CPU_usergs_sysret64 232 /* offsetof(struct pv_cpu_ops, usergs_sysret64) */ +#define PV_CPU_swapgs 248 /* offsetof(struct pv_cpu_ops, swapgs) */ + +#define KVM_STEAL_TIME_preempted 16 /* offsetof(struct kvm_steal_time, preempted) */ + +#define pt_regs_bx 40 /* offsetof(struct pt_regs, bx) */ +#define pt_regs_cx 88 /* offsetof(struct pt_regs, cx) */ +#define pt_regs_dx 96 /* offsetof(struct pt_regs, dx) */ +#define pt_regs_sp 152 /* offsetof(struct pt_regs, sp) */ +#define pt_regs_bp 32 /* offsetof(struct pt_regs, bp) */ +#define pt_regs_si 104 /* offsetof(struct pt_regs, si) */ +#define pt_regs_di 112 /* offsetof(struct pt_regs, di) */ +#define pt_regs_r8 72 /* offsetof(struct pt_regs, r8) */ +#define pt_regs_r9 64 /* offsetof(struct pt_regs, r9) */ +#define pt_regs_r10 56 /* offsetof(struct pt_regs, r10) */ +#define pt_regs_r11 48 /* offsetof(struct pt_regs, r11) */ +#define pt_regs_r12 24 /* offsetof(struct pt_regs, r12) */ +#define pt_regs_r13 16 /* offsetof(struct pt_regs, r13) */ +#define pt_regs_r14 8 /* offsetof(struct pt_regs, r14) */ +#define pt_regs_r15 0 /* offsetof(struct pt_regs, r15) */ +#define pt_regs_flags 144 /* offsetof(struct pt_regs, flags) */ + +#define saved_context_cr0 202 /* offsetof(struct saved_context, cr0) */ +#define saved_context_cr2 210 /* offsetof(struct saved_context, cr2) */ +#define saved_context_cr3 218 /* offsetof(struct saved_context, cr3) */ +#define saved_context_cr4 226 /* offsetof(struct saved_context, cr4) */ +#define saved_context_cr8 234 /* offsetof(struct saved_context, cr8) */ +#define saved_context_gdt_desc 277 /* offsetof(struct saved_context, gdt_desc) */ + +#define TSS_ist 36 /* offsetof(struct tss_struct, x86_tss.ist) */ +#define TSS_sp0 4 /* offsetof(struct tss_struct, x86_tss.sp0) */ +#define TSS_sp1 12 /* offsetof(struct tss_struct, x86_tss.sp1) */ + +#define stack_canary_offset 40 /* offsetof(union irq_stack_union, stack_canary) */ + +#define __NR_syscall_max 332 /* sizeof(syscalls_64) - 1 */ +#define NR_syscalls 333 /* sizeof(syscalls_64) */ +#define __NR_syscall_compat_max 384 /* sizeof(syscalls_ia32) - 1 */ +#define IA32_NR_syscalls 385 /* sizeof(syscalls_ia32) */ + +#define TASK_threadsp 9176 /* offsetof(struct task_struct, thread.sp) */ +#define TASK_stack_canary 2808 /* offsetof(struct task_struct, stack_canary) */ + +#define TASK_TI_flags 0 /* offsetof(struct task_struct, thread_info.flags) */ +#define TASK_addr_limit 9304 /* offsetof(struct task_struct, thread.addr_limit) */ + +#define crypto_tfm_ctx_offset 64 /* offsetof(struct crypto_tfm, __crt_ctx) */ + +#define pbe_address 0 /* offsetof(struct pbe, address) */ +#define pbe_orig_address 8 /* offsetof(struct pbe, orig_address) */ +#define pbe_next 16 /* offsetof(struct pbe, next) */ + +#define IA32_SIGCONTEXT_ax 44 /* offsetof(struct sigcontext_32, ax) */ +#define IA32_SIGCONTEXT_bx 32 /* offsetof(struct sigcontext_32, bx) */ +#define IA32_SIGCONTEXT_cx 40 /* offsetof(struct sigcontext_32, cx) */ +#define IA32_SIGCONTEXT_dx 36 /* offsetof(struct sigcontext_32, dx) */ +#define IA32_SIGCONTEXT_si 20 /* offsetof(struct sigcontext_32, si) */ +#define IA32_SIGCONTEXT_di 16 /* offsetof(struct sigcontext_32, di) */ +#define IA32_SIGCONTEXT_bp 24 /* offsetof(struct 
sigcontext_32, bp) */ +#define IA32_SIGCONTEXT_sp 28 /* offsetof(struct sigcontext_32, sp) */ +#define IA32_SIGCONTEXT_ip 56 /* offsetof(struct sigcontext_32, ip) */ + +#define IA32_RT_SIGFRAME_sigcontext 164 /* offsetof(struct rt_sigframe_ia32, uc.uc_mcontext) */ + +#define PARAVIRT_PATCH_pv_cpu_ops 24 /* offsetof(struct paravirt_patch_template, pv_cpu_ops) */ +#define PARAVIRT_PATCH_pv_irq_ops 296 /* offsetof(struct paravirt_patch_template, pv_irq_ops) */ +#define PV_IRQ_irq_disable 16 /* offsetof(struct pv_irq_ops, irq_disable) */ +#define PV_IRQ_irq_enable 24 /* offsetof(struct pv_irq_ops, irq_enable) */ +#define PV_CPU_iret 240 /* offsetof(struct pv_cpu_ops, iret) */ +#define PV_CPU_read_cr0 16 /* offsetof(struct pv_cpu_ops, read_cr0) */ +#define PV_MMU_read_cr2 0 /* offsetof(struct pv_mmu_ops, read_cr2) */ + +#define BP_scratch 484 /* offsetof(struct boot_params, scratch) */ +#define BP_secure_boot 492 /* offsetof(struct boot_params, secure_boot) */ +#define BP_loadflags 529 /* offsetof(struct boot_params, hdr.loadflags) */ +#define BP_hardware_subarch 572 /* offsetof(struct boot_params, hdr.hardware_subarch) */ +#define BP_version 518 /* offsetof(struct boot_params, hdr.version) */ +#define BP_kernel_alignment 560 /* offsetof(struct boot_params, hdr.kernel_alignment) */ +#define BP_init_size 608 /* offsetof(struct boot_params, hdr.init_size) */ +#define BP_pref_address 600 /* offsetof(struct boot_params, hdr.pref_address) */ +#define BP_code32_start 532 /* offsetof(struct boot_params, hdr.code32_start) */ + +#define PTREGS_SIZE 168 /* sizeof(struct pt_regs) */ +#define TLB_STATE_user_pcid_flush_mask 22 /* offsetof(struct tlb_state, user_pcid_flush_mask) */ +#define CPU_ENTRY_AREA_tss 8192 /* offsetof(struct cpu_entry_area, tss) */ +#define CPU_ENTRY_AREA_entry_trampoline 20480 /* offsetof(struct cpu_entry_area, entry_trampoline) */ +#define CPU_ENTRY_AREA_entry_stack 4096 /* offsetof(struct cpu_entry_area, entry_stack_page) */ +#define SIZEOF_entry_stack 512 /* sizeof(struct entry_stack) */ + +#endif diff --git a/ops/os_stat/os_stat/include_tk3/include/linux/nospec.h b/ops/os_stat/os_stat/include_tk3/include/linux/nospec.h new file mode 100644 index 0000000000000000000000000000000000000000..0c5ef54fd4162830b55aa676c1ecae4ea6ac23f5 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/include/linux/nospec.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Linus Torvalds. All rights reserved. +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved. +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#ifndef _LINUX_NOSPEC_H +#define _LINUX_NOSPEC_H +#include + +struct task_struct; + +/** + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * When @index is out of bounds (@index >= @size), the sign bit will be + * set. Extend the sign bit to all bits and invert, giving a result of + * zero for an out of bounds index, or ~0 if within bounds [0, @size). + */ +#ifndef array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + /* + * Always calculate and emit the mask even if the compiler + * thinks the mask is not needed. The compiler does not take + * into account the value of @index under speculation. 
+ */ + OPTIMIZER_HIDE_VAR(index); + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); +} +#endif + +/* + * array_index_nospec - sanitize an array index after a bounds check + * + * For a code sequence like: + * + * if (index < size) { + * index = array_index_nospec(index, size); + * val = array[index]; + * } + * + * ...if the CPU speculates past the bounds check then + * array_index_nospec() will clamp the index within the range of [0, + * size). + */ +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ + \ + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ + \ + (typeof(_i)) (_i & _mask); \ +}) + +/* Speculation control prctl */ +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which); +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl); +/* Speculation control for seccomp enforced mitigation */ +void arch_seccomp_spec_mitigate(struct task_struct *task); + +#endif /* _LINUX_NOSPEC_H */ diff --git a/ops/os_stat/os_stat/include_tk3/kernel/sched/autogroup.h b/ops/os_stat/os_stat/include_tk3/kernel/sched/autogroup.h new file mode 100644 index 0000000000000000000000000000000000000000..27cd22b8982405c5ef2f07c7c4ff50314d220f36 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/kernel/sched/autogroup.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include +#include +#include + +struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/ops/os_stat/os_stat/include_tk3/kernel/sched/cpudeadline.h b/ops/os_stat/os_stat/include_tk3/kernel/sched/cpudeadline.h new file mode 100644 index 0000000000000000000000000000000000000000..b010d26e108eb4dbf067a8116e422dc74232cf6e --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/kernel/sched/cpudeadline.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CPUDL_H +#define _LINUX_CPUDL_H + +#include +#include + +#define IDX_INVALID -1 + +struct 
cpudl_item { + u64 dl; + int cpu; + int idx; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; +}; + + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); +int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); +void cpudl_cleanup(struct cpudl *cp); +#endif /* CONFIG_SMP */ + +#endif /* _LINUX_CPUDL_H */ diff --git a/ops/os_stat/os_stat/include_tk3/kernel/sched/cpupri.h b/ops/os_stat/os_stat/include_tk3/kernel/sched/cpupri.h new file mode 100644 index 0000000000000000000000000000000000000000..bab05001907182815bd1e3922191cd1778deb1e1 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/kernel/sched/cpupri.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + atomic_t count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int *cpu_to_pri; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, struct cpumask *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/ops/os_stat/os_stat/include_tk3/kernel/sched/features.h b/ops/os_stat/os_stat/include_tk3/kernel/sched/features.h new file mode 100644 index 0000000000000000000000000000000000000000..ff7eae0e5309f0b369f06269295ef7e51188bf1b --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/kernel/sched/features.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ +SCHED_FEAT(START_DEBIT, true) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, false) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, true) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, true) + +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) +SCHED_FEAT(LB_BIAS, true) + +/* + * Decrement CPU capacity based on time not spent running tasks + */ +SCHED_FEAT(NONTASK_CAPACITY, true) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, true) + +/* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
+ */ +SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) + +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. + */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. + */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + +SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) + +SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) + +SCHED_FEAT(BT_RUNTIME_SHARE, false) diff --git a/ops/os_stat/os_stat/include_tk3/kernel/sched/sched.h b/ops/os_stat/os_stat/include_tk3/kernel/sched/sched.h new file mode 100644 index 0000000000000000000000000000000000000000..8fcd6db6533aea54305ee6df87203e7f28817549 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/kernel/sched/sched.h @@ -0,0 +1,2295 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PARAVIRT +#include +#endif + +#include "cpupri.h" +#include "cpudeadline.h" +//#include "cpuacct.h" +#ifdef CONFIG_BT_SCHED +//#include "batch.h" +#endif + +#ifdef CONFIG_SCHED_DEBUG +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +#endif + +struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 + +extern __read_mostly int scheduler_running; + +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern void calc_global_load_tick(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); + +#ifdef CONFIG_SMP +extern void cpu_load_update_active(struct rq *this_rq); +#ifdef CONFIG_BT_SCHED +extern void update_cpu_bt_load_active(struct rq *this_rq); +#endif +#else +static inline void cpu_load_update_active(struct rq *this_rq) { } +#ifdef CONFIG_BT_SCHED +static inline void update_cpu_bt_load_active(struct rq *this_rq) { } +#endif +#endif + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are + * pretty high and the returns do not justify the increased costs. 
+ * + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to + * increase coverage and consistency always enable it on 64bit platforms. + */ +//#ifdef CONFIG_64BIT +#if 0 +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) +#else +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) + +/* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE (10) + + +#ifdef CONFIG_BT_SCHED +#define RQ_CFS_NR_UNINTERRUPTIBLE(rq) \ + ((rq)->nr_uninterruptible - (rq)->bt.nr_uninterruptible) + +#define RQ_CFS_NR_RUNNING(rq) \ + ((rq)->nr_running - (rq)->bt_nr_running) + +#else +#define RQ_CFS_NR_UNINTERRUPTIBLE(rq) \ + ((rq)->nr_uninterruptible) + +#define RQ_CFS_NR_RUNNING(rq) \ + ((rq)->nr_running) + +#endif + +/* + * These are the 'tuning knobs' of the scheduler: + */ + +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + +static inline int rt_policy(int policy) +{ + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; +} + +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || +#ifdef CONFIG_BT_SCHED_1 + bt_policy(policy) || +#endif + rt_policy(policy) || dl_policy(policy); +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_time_before(a->deadline, b->deadline); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; + unsigned int rt_period_active; +}; + +void __dl_clear_params(struct task_struct *p); + +/* + * To keep the bandwidth of -deadline tasks and groups under control + * we need some place where: + * - store the maximum -deadline bandwidth of the system (the group); + * - cache the fraction of that bandwidth that is currently allocated. + * + * This is all done in the data structure below. 
It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, the bandwidth is given on a per-CPU basis, + * meaning that: + * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; + * - dl_total_bw array contains, in the i-eth element, the currently + * allocated bandwidth on the i-eth CPU. + * Moreover, groups consume bandwidth on each CPU, while tasks only + * consume bandwidth on the CPU they're running on. + * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw + * that will be shown the next time the proc or cgroup controls will + * be red. It on its turn can be changed by writing on its own + * control. + */ +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +struct dl_bw { + raw_spinlock_t lock; + u64 bw, total_bw; +}; + +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +void dl_change_utilization(struct task_struct *p, u64 new_bw); +extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern void __dl_clear_params(struct task_struct *p); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); +extern bool dl_cpu_busy(unsigned int cpu); + +#ifdef CONFIG_CGROUP_SCHED + +#include + +struct cfs_rq; +struct rt_rq; + +extern struct list_head task_groups; + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchical_quota; + + short idle, period_active; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; + + bool distribute_running; +#endif +}; + +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + +#ifdef CONFIG_SMP + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be 
accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; +#endif +#endif + +#ifdef CONFIG_BT_GROUP_SCHED + struct sched_entity **bt; + struct bt_rq **bt_rq; + unsigned long bt_shares; + + atomic64_t bt_load_avg; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + + struct cfs_bandwidth cfs_bandwidth; + + unsigned long offline; +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif +#ifdef CONFIG_BT_GROUP_SCHED +#define ROOT_TASK_GROUP_BT_LOAD NICE_0_LOAD +#define MIN_BT_SHARES (1UL << 1) +#define MAX_BT_SHARES (1UL << 18) +#endif + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); +extern void unregister_fair_sched_group(struct task_group *tg); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); + +extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_BT_SCHED +extern void free_bt_sched_group(struct task_group *tg); +extern int alloc_bt_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_bt_sched_group(struct task_group *tg); +extern int sched_group_set_bt_shares(struct task_group *tg, unsigned long shares); +extern void unregister_bt_sched_group(struct task_group *tg); +extern void init_tg_bt_entry(struct task_group *tg, struct bt_rq *bt_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +#endif + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group 
*tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); + +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_BT_GROUP_SCHED +extern int sched_group_set_bt_shares(struct task_group *tg, unsigned long shares); +#endif + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned int nr_running, h_nr_running; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root_cached tasks_timeline; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last, *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP + /* + * CFS load tracking + */ + struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; +#ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. 
+ */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; + + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; + int throttled, throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +}; + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +#ifdef CONFIG_BT_SCHED +static inline int bt_bandwidth_enabled(void) +{ + return sysctl_sched_bt_runtime >= 0; +} +#endif + +/* RT IPI pull logic requires IRQ_WORK */ +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) +# define HAVE_RT_PUSH_IPI +#endif + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; +#endif /* CONFIG_SMP */ + int rt_queued; + + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct task_group *tg; +#endif +}; + +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root_cached root; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision wether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root_cached pushable_dl_tasks_root; +#else + struct dl_bw dl_bw; +#endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; +}; + +#ifdef CONFIG_SMP + +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. 
Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* Indicate more than one runnable task for any CPU */ + bool overload; +#ifdef CONFIG_BT_SCHED + bool overload_bt; +#endif + + /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; + + unsigned long max_cpu_capacity; +}; + +extern struct root_domain def_root_domain; +extern struct mutex sched_domains_mutex; + +extern void init_defrootdomain(void); +extern int sched_init_domains(const struct cpumask *cpu_map); +extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); + +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif +#endif /* CONFIG_SMP */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
+ */ + unsigned int nr_running; +#ifdef CONFIG_BT_SCHED + unsigned int bt_nr_running; + u64 bt_blocked_clock; +#endif +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; +#endif + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; +#ifdef CONFIG_BT_SCHED + unsigned long cpu_bt_load[CPU_LOAD_IDX_MAX]; +#endif +#ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; +#ifdef CONFIG_BT_SCHED + unsigned long last_bt_load_update_tick; + unsigned long do_lb; +#endif +#endif /* CONFIG_SMP */ + unsigned long nohz_flags; +#endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + +#ifdef CONFIG_BT_SCHED + struct load_weight bt_load; + unsigned long nr_bt_load_updates; +#endif + + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; +#ifdef CONFIG_BT_SCHED_1 + struct bt_rq bt; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_BT_GROUP_SCHED + struct list_head leaf_bt_rq_list; +#ifdef CONFIG_SMP + unsigned long h_bt_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_BT_GROUP_SCHED */ + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle, *stop; + unsigned long next_balance; +#ifdef CONFIG_BT_SCHED + unsigned long next_balance_bt; +#endif + struct mm_struct *prev_mm; + + unsigned int clock_update_flags; + u64 clock; + u64 clock_task; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; + + struct callback_head *balance_callback; + + unsigned char idle_balance; + /* For active balancing */ + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + +#ifdef CONFIG_BT_SCHED + int active_balance_bt; + int push_cpu_bt; + struct cpu_stop_work active_bt_balance_work; +#endif + + /* cpu of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; +#ifdef CONFIG_BT_SCHED + struct list_head bt_tasks; +#endif + + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; +#ifdef CONFIG_BT_SCHED + u64 idle_bt_stamp; + u64 avg_idle_bt; +#endif + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + call_single_data_t hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif +}; + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + + +#ifdef CONFIG_SCHED_SMT +extern void __update_idle_core(struct rq *rq); + +static inline void update_idle_core(struct rq *rq) +{ + if (static_branch_unlikely(&sched_smt_present)) + __update_idle_core(rq); +} + +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + +#define cpu_rq(cpu) (per_cpu_ptr(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() raw_cpu_ptr(runqueues) + +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return READ_ONCE(rq->clock); +} + +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq-clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + +static inline u64 rq_clock(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + + return rq->clock_task; +} + +static inline void rq_clock_skip_update(struct rq *rq, bool skip) +{ + lockdep_assert_held(&rq->lock); + if (skip) + rq->clock_update_flags |= RQCF_REQ_SKIP; + else + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). 
+ */ + unsigned int clock_update_flags; +#endif +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(&rq->lock); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(&rq->lock, rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(&rq->lock, rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. + */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif +} + +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +#endif + +#ifdef CONFIG_NUMA +extern void sched_init_numa(void); +extern void sched_domains_numa_masks_set(unsigned int cpu); +extern void sched_domains_numa_masks_clear(unsigned int cpu); +#else +static inline void sched_init_numa(void) { } +static inline void sched_domains_numa_masks_set(unsigned int cpu) { } +static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +#endif + +#ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *, struct task_struct *); +#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_SMP + +static inline void +queue_balance_callback(struct rq *rq, + struct callback_head *head, + void (*func)(struct rq *rq)) +{ + lockdep_assert_held(&rq->lock); + + if (unlikely(head->next)) + return; + + head->func = (void (*)(struct callback_head *))func; + head->next = rq->balance_callback; + rq->balance_callback = head; +} + +extern void sched_ttwu_pending(void); + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) + +#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The cpu whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given cpu. + * + * Returns the highest sched_domain of a cpu which contains the given flag. 
+ */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + for_each_domain(cpu, sd) { + if (!(sd->flags & flag)) + break; + hsd = sd; + } + + return hsd; +} + +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) + break; + } + + return sd; +} + +DECLARE_PER_CPU(int, sd_llc_size); +DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain *, sd_numa); +DECLARE_PER_CPU(struct sched_domain *, sd_asym); + +struct sched_group_capacity { + atomic_t ref; + /* + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity + * for a single CPU. + */ + unsigned long capacity; +#ifdef CONFIG_BT_SCHED + unsigned long capacity_orig; + unsigned long capacity_bt; +#endif + unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ + +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* balance mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + +#ifdef CONFIG_BT_SCHED + int bt_balance_cpu; +#endif + unsigned int group_weight; + struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* cpu of highest priority in group */ + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +static inline struct cpumask *sched_group_span(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * See build_balance_mask(). + */ +static inline struct cpumask *group_balance_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgc->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_span(group)); +} + +extern int group_balance_cpu(struct sched_group *sg); + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + +#else + +static inline void sched_ttwu_pending(void) { } + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "autogroup.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). 
+ * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + return p->sched_task_group; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) || \ + defined(CONFIG_BT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_BT_GROUP_SCHED + p->bt.bt_rq = tg->bt_rq[cpu]; + p->bt.parent = tg->bt[cpu]; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else + task_thread_info(p)->cpu = cpu; +#endif + p->wake_cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +extern const_debug unsigned int sysctl_sched_features; + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ +{ \ + return static_key_##enabled(key); \ +} + +#include "features.h" + +#undef SCHED_FEAT + +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) +#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ + +extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +#ifdef CONFIG_BT_SCHED +static inline u64 global_bt_period(void) +{ + return (u64)sysctl_sched_bt_period * NSEC_PER_USEC; +} + +static inline u64 global_bt_runtime(void) +{ + if (sysctl_sched_bt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_bt_runtime * NSEC_PER_USEC; +} +#endif + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + 
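/*
 * Illustrative sketch, not part of the upstream sched.h added by this
 * patch: task_struct::on_rq is a tri-state -- 0 (not enqueued),
 * TASK_ON_RQ_QUEUED (on some CPU's runqueue) or TASK_ON_RQ_MIGRATING
 * (temporarily taken off a runqueue while being moved to another CPU).
 * The two helpers that follow each test one of the non-zero states; a
 * hypothetical caller that only cares whether the task is runnable
 * anywhere could combine them, e.g.:
 *
 *	static inline bool task_is_runnable_somewhere(struct task_struct *p)
 *	{
 *		return task_on_rq_queued(p) || task_on_rq_migrating(p);
 *	}
 *
 * (task_is_runnable_somewhere is an illustrative name, not a kernel API.)
 */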
+static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x4 /* internal use, task got migrated */ + +extern inline void update_load_add(struct load_weight *lw, unsigned long inc); +extern inline void update_load_sub(struct load_weight *lw, unsigned long dec); +extern inline void update_load_set(struct load_weight *lw, unsigned long w); +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; + +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 + +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x40 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif + +#define RETRY_TASK ((void *)-1UL) + +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + + /* + * It is the responsibility of the pick_next_task() method that will + * return the next task to call put_prev_task() on the @prev task or + * something equivalent. + * + * May return RETRY_TASK when it finds a higher prio class has runnable + * tasks. + */ + struct task_struct * (*pick_next_task) (struct rq *rq, + struct task_struct *prev, + struct rq_flags *rf); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p); + + void (*task_woken) (struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); + void (*task_fork) (struct task_struct *p); + void (*task_dead) (struct task_struct *p); + + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. 
+ */ + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); + + void (*update_curr) (struct rq *rq); + +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_BT_GROUP_SCHED) + void (*task_change_group) (struct task_struct *p, int type); +#endif +}; + +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + prev->sched_class->put_prev_task(rq, prev); +} + +static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +{ + curr->sched_class->set_curr_task(rq); +} + +#ifdef CONFIG_SMP +#define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + + +#ifdef CONFIG_SMP + +extern void update_group_capacity(struct sched_domain *sd, int cpu); + +extern void trigger_load_balance(struct rq *rq); + +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + +extern int idle_balance_bt(struct rq *this_rq, struct rq_flags *rf); + +extern int idle_balance(struct rq *this_rq, struct rq_flags *rf); + +#if defined(CONFIG_BT_GROUP_SCHED) +extern void idle_enter_bt(struct rq *this_rq); +extern void idle_exit_bt(struct rq *this_rq); +#else +static inline void idle_enter_bt(struct rq *this_rq) {} +static inline void idle_exit_bt(struct rq *this_rq) {} +#endif + +#else +int idle_balance_bt(struct rq *this_rq, struct rq_flags *rf) +{ + return 0; +} + +static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +{ + return 0; +} +#endif + +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + SCHED_WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + +extern void schedule_idle(void); + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); + +extern void init_sched_dl_class(void); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); +#ifdef CONFIG_BT_SCHED +extern void init_sched_bt_class(void); +extern void update_idle_cpu_bt_load(struct rq *this_rq); +extern void init_bt_entity_runnable_average(struct sched_entity *se); +extern void post_init_bt_entity_util_avg(struct sched_entity *se); +#endif + +extern void resched_curr(struct rq *rq); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct 
sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); + +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 +unsigned long to_ratio(u64 period, u64 runtime); + +extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); + +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. + */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + unsigned prev_nr = rq->nr_running; + + rq->nr_running = prev_nr + count; + +#ifndef CONFIG_BT_SCHED + if (prev_nr < 2 && rq->nr_running >= 2) { +#ifdef CONFIG_SMP + if (!rq->rd->overload) + rq->rd->overload = true; +#endif + } +#else +#ifdef CONFIG_SMP + if (!rq->rd->overload && (rq->nr_running - rq->bt_nr_running >= 2)) + rq->rd->overload = true; +#endif + + if (rq->nr_running >= 2) { +#ifdef CONFIG_SMP + if (rq->bt_nr_running && !rq->rd->overload_bt) + rq->rd->overload_bt = true; +#endif + } +#endif + + sched_update_tick_dependency(rq); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); +} + +static inline void rq_last_tick_reset(struct rq *rq) +{ +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = jiffies; +#endif +} + +extern void update_rq_clock(struct rq *rq); + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_time_avg; +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +static inline u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return 
sd->smt_gain / sd->span_weight; + + return SCHED_CAPACITY_SCALE; +} +#endif + +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); + sched_avg_update(rq); +} +#else +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } +static inline void sched_avg_update(struct rq *rq) { } +#endif + +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. 
+ */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!raw_spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); + ret = 1; + } else + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); + } + return ret; +} + +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + raw_spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + raw_spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + raw_spin_lock(l1); + raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +extern void set_rq_online (struct rq *rq); +extern void set_rq_offline(struct rq *rq); +extern bool sched_smp_initialized; + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
+}
+
+#endif
+
+extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
+extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_SCHED_DEBUG
+extern bool sched_debug_enabled;
+
+extern void print_cfs_stats(struct seq_file *m, int cpu);
+extern void print_rt_stats(struct seq_file *m, int cpu);
+extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+#ifdef CONFIG_NUMA_BALANCING
+extern void
+show_numa_stats(struct task_struct *p, struct seq_file *m);
+extern void
+print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
+
+extern void init_cfs_rq(struct cfs_rq *cfs_rq);
+extern void init_rt_rq(struct rt_rq *rt_rq);
+extern void init_dl_rq(struct dl_rq *dl_rq);
+
+extern void cfs_bandwidth_usage_inc(void);
+extern void cfs_bandwidth_usage_dec(void);
+
+#ifdef CONFIG_NO_HZ_COMMON
+enum rq_nohz_flag_bits {
+ NOHZ_TICK_STOPPED,
+ NOHZ_BALANCE_KICK,
+};
+
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+
+extern void nohz_balance_exit_idle(unsigned int cpu);
+#else
+static inline void nohz_balance_exit_idle(unsigned int cpu) { }
+#endif
+
+
+#ifdef CONFIG_SMP
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
+#else
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
+#endif
+
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+struct irqtime {
+ u64 total;
+ u64 tick_delta;
+ u64 irq_start_time;
+ struct u64_stats_sync sync;
+};
+
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime
+ * subtracted and would never move forward.
+ */
+static inline u64 irq_time_read(int cpu)
+{
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq;
+ u64 total;
+
+ do {
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total = irqtime->total;
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+
+ return total;
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
+ *
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); + if (data) + data->func(data, rq_clock(rq), flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif diff --git a/ops/os_stat/os_stat/include_tk3/kernel/sched/stats.h b/ops/os_stat/os_stat/include_tk3/kernel/sched/stats.h new file mode 100644 index 0000000000000000000000000000000000000000..baf500d12b7c9eaa951657598b781b6b67ab8b3e --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/kernel/sched/stats.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifdef CONFIG_SCHEDSTATS + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcount++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_cpu_time += delta; +} + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) + +#else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{} +#define schedstat_enabled() 0 +#define schedstat_inc(var) do { } while (0) +#define schedstat_add(var, amt) do { } while (0) +#define schedstat_set(var, val) do { } while (0) +#define schedstat_val(var) 0 +#define schedstat_val_or_zero(var) 0 +#endif /* CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_SCHED_INFO +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * We are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a cpu, we call this routine + * from dequeue_task() to account for possible rq->clock skew across cpus. The + * delta taken on each cpu would annul the skew. + */ +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) +{ + unsigned long long now = rq_clock(rq), delta = 0; + + if (unlikely(sched_info_on())) + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(rq, delta); +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct rq *rq, struct task_struct *t) +{ + unsigned long long now = rq_clock(rq), delta = 0; + + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcount++; + + rq_sched_info_arrive(rq, delta); +} + +/* + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct rq *rq, struct task_struct *t) +{ + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); +} + +/* + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. + */ +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) +{ + unsigned long long delta = rq_clock(rq) - + t->sched_info.last_arrival; + + rq_sched_info_depart(rq, delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(rq, t); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) +{ + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
+ */ + if (prev != rq->idle) + sched_info_depart(rq, prev); + + if (next != rq->idle) + sched_info_arrive(rq, next); +} +static inline void +sched_info_switch(struct rq *rq, + struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(rq, prev, next); +} +#else +#define sched_info_queued(rq, t) do { } while (0) +#define sched_info_reset_dequeued(t) do { } while (0) +#define sched_info_dequeued(rq, t) do { } while (0) +#define sched_info_depart(rq, t) do { } while (0) +#define sched_info_arrive(rq, next) do { } while (0) +#define sched_info_switch(rq, t, next) do { } while (0) +#endif /* CONFIG_SCHED_INFO */ diff --git a/ops/os_stat/os_stat/include_tk3/mm/slab.h b/ops/os_stat/os_stat/include_tk3/mm/slab.h new file mode 100644 index 0000000000000000000000000000000000000000..485d9fbb8802f85e8df71d3004e640c3b79b9d5f --- /dev/null +++ b/ops/os_stat/os_stat/include_tk3/mm/slab.h @@ -0,0 +1,521 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. + */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + unsigned long flags; /* Active flags on the slab */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#endif /* CONFIG_SLOB */ + +#ifdef CONFIG_SLAB +#include +#endif + +#ifdef CONFIG_SLUB +#include +#endif + +#include +#include +#include +#include +#include +#include + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. 
+ */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name; + unsigned long size; +} kmalloc_info[]; + +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size); + +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); +void create_kmalloc_caches(unsigned long); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); +#endif + + +/* Functions provided by the slab allocators */ +extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); + +extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, + unsigned long flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + size_t size, unsigned long flags); + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)); +#ifndef CONFIG_SLOB +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)); + +unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)); +#else +static inline struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ return NULL; } + +static inline unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} +#endif + + +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_ACCOUNT) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +/* Common flags available with current configuration */ +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY 
| \ + SLAB_ACCOUNT) + +int __kmem_cache_shutdown(struct kmem_cache *); +void __kmem_cache_release(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); +void __kmemcg_cache_deactivate(struct kmem_cache *s); +void slab_kmem_cache_release(struct kmem_cache *); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the object listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +/* List of all root caches. */ +extern struct list_head slab_root_caches; +#define root_caches_node memcg_params.__root_caches_node + +/* + * Iterate over all memcg caches of the given root cache. The caller must hold + * slab_mutex. + */ +#define for_each_memcg_cache(iter, root) \ + list_for_each_entry(iter, &(root)->memcg_params.children, \ + memcg_params.children_node) + +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params.root_cache; +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return p == s || p == s->memcg_params.root_cache; +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + s = s->memcg_params.root_cache; + return s->name; +} + +/* + * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. + * That said the caller must assure the memcg's cache won't go away by either + * taking a css reference to the owner cgroup, or holding the slab_mutex. + */ +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) +{ + struct kmem_cache *cachep; + struct memcg_cache_array *arr; + + rcu_read_lock(); + arr = rcu_dereference(s->memcg_params.memcg_caches); + + /* + * Make sure we will access the up-to-date value. The code updating + * memcg_caches issues a write barrier to match this (see + * memcg_create_kmem_cache()). 
+ */ + cachep = READ_ONCE(arr->entries[idx]); + rcu_read_unlock(); + + return cachep; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params.root_cache; +} + +static __always_inline int memcg_charge_slab(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) +{ + if (!memcg_kmem_enabled()) + return 0; + if (is_root_cache(s)) + return 0; + return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); +} + +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + if (!memcg_kmem_enabled()) + return; + memcg_kmem_uncharge(page, order); +} + +extern void slab_init_memcg_params(struct kmem_cache *); +extern void memcg_link_cache(struct kmem_cache *s); +extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)); + +#else /* CONFIG_MEMCG && !CONFIG_SLOB */ + +/* If !memcg, all caches are root. */ +#define slab_root_caches slab_caches +#define root_caches_node list + +#define for_each_memcg_cache(iter, root) \ + for ((void)(iter), (void)(root); 0; ) + +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) +{ + return NULL; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} + +static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, + struct kmem_cache *s) +{ + return 0; +} + +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + +static inline void slab_init_memcg_params(struct kmem_cache *s) +{ +} + +static inline void memcg_link_cache(struct kmem_cache *s) +{ +} + +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value. but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); + WARN_ON_ONCE(1); + return s; +} + +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. 
+ */ + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s, flags)) + return NULL; + + if (memcg_kmem_enabled() && + ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT))) + return memcg_kmem_get_cache(s); + + return s; +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + void *object = p[i]; + + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object, flags); + } + + if (memcg_kmem_enabled()) + memcg_kmem_put_cache(s); +} + +#ifndef CONFIG_SLOB +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct alien_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. 
The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + +#endif + +void *slab_start(struct seq_file *m, loff_t *pos); +void *slab_next(struct seq_file *m, void *p, loff_t *pos); +void slab_stop(struct seq_file *m, void *p); +void *memcg_slab_start(struct seq_file *m, loff_t *pos); +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); +void memcg_slab_stop(struct seq_file *m, void *p); +int memcg_slab_show(struct seq_file *m, void *p); + +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp); +void cache_random_seq_destroy(struct kmem_cache *cachep); +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, + unsigned int count, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +#endif /* MM_SLAB_H */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/drivers/block/loop.h b/ops/os_stat/os_stat/include_tk4_arm/drivers/block/loop.h new file mode 100644 index 0000000000000000000000000000000000000000..af75a5ee409440b24ab633d3a744ed9f8c307256 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/drivers/block/loop.h @@ -0,0 +1,94 @@ +/* + * loop.h + * + * Written by Theodore Ts'o, 3/29/93. + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. + */ +#ifndef _LINUX_LOOP_H +#define _LINUX_LOOP_H + +#include +#include +#include +#include +#include +#include +#include + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, +}; + +struct loop_func_table; + +struct loop_device { + int lo_number; + atomic_t lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + int (*transfer)(struct loop_device *, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + char lo_file_name[LO_NAME_SIZE]; + char lo_crypt_name[LO_NAME_SIZE]; + char lo_encrypt_key[LO_KEY_SIZE]; + int lo_encrypt_key_size; + struct loop_func_table *lo_encryption; + __u32 lo_init[2]; + kuid_t lo_key_owner; /* Who set the key */ + int (*ioctl)(struct loop_device *, int cmd, + unsigned long arg); + + struct file * lo_backing_file; + struct block_device *lo_device; + void *key_data; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + struct kthread_worker worker; + struct task_struct *worker_task; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; +}; + +struct loop_cmd { + struct kthread_work work; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *css; +}; + +/* Support for loadable transfer modules */ +struct loop_func_table { + int number; /* filter type */ + int (*transfer)(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + int (*init)(struct loop_device *, const struct loop_info64 *); + /* release is called from loop_unregister_transfer or 
clr_fd */ + int (*release)(struct loop_device *); + int (*ioctl)(struct loop_device *, int cmd, unsigned long arg); + struct module *owner; +}; + +int loop_register_transfer(struct loop_func_table *funcs); +int loop_unregister_transfer(int number); + +#endif diff --git a/ops/os_stat/os_stat/include_tk4_arm/drivers/target/loop.h b/ops/os_stat/os_stat/include_tk4_arm/drivers/target/loop.h new file mode 100644 index 0000000000000000000000000000000000000000..af75a5ee409440b24ab633d3a744ed9f8c307256 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/drivers/target/loop.h @@ -0,0 +1,94 @@ +/* + * loop.h + * + * Written by Theodore Ts'o, 3/29/93. + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. + */ +#ifndef _LINUX_LOOP_H +#define _LINUX_LOOP_H + +#include +#include +#include +#include +#include +#include +#include + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, +}; + +struct loop_func_table; + +struct loop_device { + int lo_number; + atomic_t lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + int (*transfer)(struct loop_device *, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + char lo_file_name[LO_NAME_SIZE]; + char lo_crypt_name[LO_NAME_SIZE]; + char lo_encrypt_key[LO_KEY_SIZE]; + int lo_encrypt_key_size; + struct loop_func_table *lo_encryption; + __u32 lo_init[2]; + kuid_t lo_key_owner; /* Who set the key */ + int (*ioctl)(struct loop_device *, int cmd, + unsigned long arg); + + struct file * lo_backing_file; + struct block_device *lo_device; + void *key_data; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + int lo_state; + struct kthread_worker worker; + struct task_struct *worker_task; + bool use_dio; + bool sysfs_inited; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; +}; + +struct loop_cmd { + struct kthread_work work; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ + long ret; + struct kiocb iocb; + struct bio_vec *bvec; + struct cgroup_subsys_state *css; +}; + +/* Support for loadable transfer modules */ +struct loop_func_table { + int number; /* filter type */ + int (*transfer)(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + int (*init)(struct loop_device *, const struct loop_info64 *); + /* release is called from loop_unregister_transfer or clr_fd */ + int (*release)(struct loop_device *); + int (*ioctl)(struct loop_device *, int cmd, unsigned long arg); + struct module *owner; +}; + +int loop_register_transfer(struct loop_func_table *funcs); +int loop_unregister_transfer(int number); + +#endif diff --git a/ops/os_stat/os_stat/include_tk4_arm/drivers/target/target_core_file.h b/ops/os_stat/os_stat/include_tk4_arm/drivers/target/target_core_file.h new file mode 100644 index 0000000000000000000000000000000000000000..929b1ecd544ee0ffb84973b64867a3dabb8a2f45 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/drivers/target/target_core_file.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef TARGET_CORE_FILE_H +#define TARGET_CORE_FILE_H + +#include + +#define FD_VERSION "4.0" + +#define FD_MAX_DEV_NAME 256 +#define FD_MAX_DEV_PROT_NAME FD_MAX_DEV_NAME + 16 +#define FD_DEVICE_QUEUE_DEPTH 32 +#define FD_MAX_DEVICE_QUEUE_DEPTH 128 +#define FD_BLOCKSIZE 512 +/* 
+ * Limited by the number of iovecs (2048) per vfs_[writev,readv] call + */ +#define FD_MAX_BYTES 8388608 + +#define RRF_EMULATE_CDB 0x01 +#define RRF_GOT_LBA 0x02 + +#define FBDF_HAS_PATH 0x01 +#define FBDF_HAS_SIZE 0x02 +#define FDBD_HAS_BUFFERED_IO_WCE 0x04 +#define FDBD_HAS_ASYNC_IO 0x08 +#define FDBD_FORMAT_UNIT_SIZE 2048 + +struct fd_dev { + struct se_device dev; + + u32 fbd_flags; + unsigned char fd_dev_name[FD_MAX_DEV_NAME]; + /* Unique Ramdisk Device ID in Ramdisk HBA */ + u32 fd_dev_id; + /* Number of SG tables in sg_table_array */ + u32 fd_table_count; + u32 fd_queue_depth; + u32 fd_block_size; + unsigned long long fd_dev_size; + struct file *fd_file; + struct file *fd_prot_file; + /* FILEIO HBA device is connected to */ + struct fd_host *fd_host; +} ____cacheline_aligned; + +struct fd_host { + u32 fd_host_dev_id_count; + /* Unique FILEIO Host ID */ + u32 fd_host_id; +} ____cacheline_aligned; + +#endif /* TARGET_CORE_FILE_H */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_new/ext4.h b/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_new/ext4.h new file mode 100644 index 0000000000000000000000000000000000000000..ca699af28e4de8e1f50a56a198ab9addaa0472a2 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_new/ext4.h @@ -0,0 +1,3446 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#include +#include + +#include + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. 
+ */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + atomic_t count; /* reference counter */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +} ext4_io_end_t; + +struct ext4_io_submit { + struct writeback_control *io_wbc; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) 
(EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ + EXT4_PROJINHERIT_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 mdata_blocks; + __u32 free_clusters_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define 
EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. */ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT +#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + +#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define 
EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK 
((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. 
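+ *
+ * Worked example (illustrative): a tv_sec of 0x100000000 (early 2106)
+ * stores 0x00000000 in the 32-bit on-disk field and epoch bits 01 in the
+ * two low bits of the _extra field; on decode those epoch bits contribute
+ * 0x100000000 back to tv_sec, recovering the original 64-bit value.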
+ */ + +static inline __le32 ext4_encode_extra_time(struct timespec64 *time) +{ + u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline void ext4_decode_extra_time(struct timespec64 *time, + __le32 extra) +{ + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) + time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ + } \ + else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + } \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) 
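+ *
+ * (Illustrative note: these values serve as lockdep nesting subclasses for
+ * i_data_sem, e.g. passed to down_write_nested() when the i_data_sem of two
+ * inodes must be held at the same time.)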
+ * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. Its function is same as that of + * struct timespec64 i_{a,c,m}time in the generic inode. 
+ */ + struct timespec64 i_crtime; + + /* mballoc */ + atomic_t i_prealloc_active; + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + + kprojid_t i_projid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ + +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. 
+ * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32] __nonstring; /* function where the 
error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __u8 s_wtime_hi; + __u8 s_mtime_hi; + __u8 s_mkfs_time_hi; + __u8 s_lastcheck_hi; + __u8 s_first_error_time_hi; + __u8 s_last_error_time_hi; + __u8 s_pad[2]; + __le16 s_encoding; /* Filename charset encoding */ + __le16 s_encoding_flags; /* Filename charset encoding flags */ + __le32 s_reserved[95]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 3 + +#define EXT4_ENC_UTF8_12_1 1 + +/* + * Flags for ext4_sb_info.s_encoding_flags. 
+ */ +#define EXT4_ENC_STRICT_MODE_FL (1 << 0) + +#define ext4_has_strict_mode(sbi) \ + (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head * __rcu *s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; +#ifdef CONFIG_UNICODE + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_ext4_flags; /* Ext4 superblock flags */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct ext4_system_blocks __rcu *system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ** __rcu *s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List 
of blocks to be freed + after commit completed */ + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_mb_max_inode_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + + /* + * Barrier between writepages ops and changing any inode's JOURNAL_DATA + * or EXTENTS flag. 
+ */ + struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Returns: sbi->field[index] + * Used to access an array element from the following sbi fields which require + * rcu protection to avoid dereferencing an invalid pointer due to reassignment + * - s_group_desc + * - s_group_info + * - s_flex_group + */ +#define sbi_array_rcu_deref(sbi, field, index) \ +({ \ + typeof(*((sbi)->field)) _v; \ + rcu_read_lock(); \ + _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ + rcu_read_unlock(); \ + _v; \ +}) + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. 
This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + +/* + * Feature set definitions + */ + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 + +extern void ext4_update_dynamic_rev(struct super_block *sb); + +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) 
+EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) +EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXTN_FEATURE_FUNCS(ver) \ +static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static 
inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + +/* + * Superblock flags + */ +#define EXT4_FLAGS_RESIZING 0 +#define EXT4_FLAGS_SHUTDOWN 1 + +static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); +} + + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + *(u32 *)desc.ctx = crc; + + BUG_ON(crypto_shash_update(&desc.shash, address, length)); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + +struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_str crypto_buf; +#endif +#ifdef CONFIG_UNICODE + struct fscrypt_str cf_name; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) 
+{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. 
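+ * See the EXT4_MMP_CHECK_MULT multiplier and the
+ * EXT4_MMP_MIN_CHECK_INTERVAL / EXT4_MMP_MAX_CHECK_INTERVAL bounds
+ * defined a little further below.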
+ */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t 
ext4_inode_to_goal_block(struct inode *); + +#ifdef CONFIG_UNICODE +extern void ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct fscrypt_str *fname); +#endif + +#ifdef CONFIG_FS_ENCRYPTION +static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, + const struct fscrypt_name *src) +{ + memset(dst, 0, sizeof(*dst)); + + dst->usr_fname = src->usr_fname; + dst->disk_name = src->disk_name; + dst->hinfo.hash = src->hash; + dst->hinfo.minor_hash = src->minor_hash; + dst->crypto_buf = src->crypto_buf; +} + +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); +#endif + return 0; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); +#endif + return 0; +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; + +#ifdef CONFIG_UNICODE + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} +#else /* !CONFIG_FS_ENCRYPTION */ +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); +#endif + + return 0; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ +#ifdef CONFIG_UNICODE + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} +#endif /* !CONFIG_FS_ENCRYPTION */ + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); +static inline void ext4_update_dx_flag(struct inode *inode) +{ 
+ if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... */ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } +} +static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + 0, (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern const struct seq_operations ext4_mb_seq_groups_ops; +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *, unsigned int); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); + +/* inode.c */ +int ext4_inode_is_fast_symlink(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, 
struct buffer_head **bhs); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern int ext4_truncate(struct inode *); +extern int ext4_break_layouts(struct inode *); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + 
ext4_lblk_t start, ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern bool ext4_empty_dir(struct inode *inode); + +/* resize.c */ +extern void ext4_kvfree_array_rcu(void *to_free); +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); +extern int ext4_calculate_overhead(struct super_block *sb); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); + +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#define EXT4_ERROR_INODE(inode, fmt, a...) 
\ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && + !EXT4_SB(sb)->s_chksum_driver); + + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline 
void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) +{ + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info **grp_info; + long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. 
Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
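+ * The counter saturates at EXT4_MAX_CONTENTION, and ext4_fs_is_busy()
+ * above reports the filesystem as busy once the counter exceeds
+ * EXT4_CONTENTION_THRESHOLD.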
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +#ifdef CONFIG_UNICODE +extern const struct dentry_operations ext4_dentry_ops; +#endif + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct inode *dir, struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int ext4_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline, __u64 start, __u64 len); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct 
inode *inode); + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void ext4_initialize_dirent_tail(struct buffer_head *bh, + unsigned int blocksize); +extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +extern int ext4_ci_compare(const struct inode *parent, + const struct qstr *fname, + const struct qstr *entry, bool quick); + +#define S_SHIFT 12 +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (ext4_has_feature_filetype(sb)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages, bool is_readahead); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); + +/* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* sysfs.c */ +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
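+ * This allows at most 2^32 logical blocks per file; with a 4 KiB
+ * block size that corresponds to a 16 TiB maximum logical file size.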
+ */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path **, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* verity.c */ +extern const struct 
fsverity_operations ext4_verityops; + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; + +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + +extern const struct iomap_ops ext4_iomap_ops; + +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _EXT4_H */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_new/extents_status.h b/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_new/extents_status.h new file mode 100644 index 0000000000000000000000000000000000000000..80a62ee17a81d073368dbec55e93c108b413b4fb --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_new/extents_status.h @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. 
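+ * The trailing underscores in the define below leave the check
+ * compiled out; change it to plain ES_AGGRESSIVE_TEST to enable it.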
+ */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. + * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. 
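+ * Each node (struct pending_reservation below) records only the
+ * logical cluster number (lclu) of a cluster that has a pending
+ * reservation.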
+ */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, + struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +extern int ext4_seq_es_shrinker_info_show(struct 
seq_file *seq, void *v); + +extern unsigned int ext4_shrink_es_timeout; +extern unsigned int ext4_shrink_es_timeout_min; + +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_clear_inode_es(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_old/ext4.h b/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_old/ext4.h new file mode 100644 index 0000000000000000000000000000000000000000..f1cc8f7de279a1630d9091092a14e5cfbb3a11a1 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_old/ext4.h @@ -0,0 +1,3444 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#include +#include + +#include + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. 
length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. 
+ */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + atomic_t count; /* reference counter */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +} ext4_io_end_t; + +struct ext4_io_submit { + struct writeback_control *io_wbc; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) 
(EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ + EXT4_PROJINHERIT_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ENCRYPT); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 mdata_blocks; + __u32 free_clusters_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define 
EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update (user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. */ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT +#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + +#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define 
EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK 
((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +/* + * We use an encoding that preserves the times for extra epoch "00": + * + * extra msb of adjust for signed + * epoch 32-bit 32-bit tv_sec to + * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range + * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 + * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 + * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 + * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 + * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 + * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 + * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 + * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 + * + * Note that previous versions of the kernel on 64-bit systems would + * incorrectly use extra epoch bits 1,1 for dates between 1901 and + * 1970. e2fsck will correct this, assuming that it is run on the + * affected filesystem before 2242. 
+ */ + +static inline __le32 ext4_encode_extra_time(struct timespec64 *time) +{ + u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); +} + +static inline void ext4_decode_extra_time(struct timespec64 *time, + __le32 extra) +{ + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) + time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ + } \ + else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + } \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" + +/* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) 
+ * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is used for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. Its function is same as that of + * struct timespec64 i_{a,c,m}time in the generic inode. 
+ */ + struct timespec64 i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_list; + unsigned int i_es_all_nr; /* protected by i_es_lock */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; + + kprojid_t i_projid; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. 
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ + +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. 
+ * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32] __nonstring; /* function where the 
error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __u8 s_wtime_hi; + __u8 s_mtime_hi; + __u8 s_mkfs_time_hi; + __u8 s_lastcheck_hi; + __u8 s_first_error_time_hi; + __u8 s_last_error_time_hi; + __u8 s_pad[2]; + __le16 s_encoding; /* Filename charset encoding */ + __le16 s_encoding_flags; /* Filename charset encoding flags */ + __le32 s_reserved[95]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 3 + +#define EXT4_ENC_UTF8_12_1 1 + +/* + * Flags for ext4_sb_info.s_encoding_flags. 
+ */ +#define EXT4_ENC_STRICT_MODE_FL (1 << 0) + +#define ext4_has_strict_mode(sbi) \ + (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head * __rcu *s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; +#ifdef CONFIG_UNICODE + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_ext4_flags; /* Ext4 superblock flags */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct ext4_system_blocks __rcu *system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ** __rcu *s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + unsigned int s_mb_free_pending; + struct list_head s_freed_data_list; /* List 
of blocks to be freed + after commit completed */ + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; + struct mb_cache *s_ea_block_cache; + struct mb_cache *s_ea_inode_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + + /* + * Barrier between writepages ops and changing any inode's JOURNAL_DATA + * or EXTENTS flag. 
+ */ + struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +/* + * Returns: sbi->field[index] + * Used to access an array element from the following sbi fields which require + * rcu protection to avoid dereferencing an invalid pointer due to reassignment + * - s_group_desc + * - s_group_info + * - s_flex_group + */ +#define sbi_array_rcu_deref(sbi, field, index) \ +({ \ + typeof(*((sbi)->field)) _v; \ + rcu_read_lock(); \ + _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ + rcu_read_unlock(); \ + _v; \ +}) + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. 
This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + +/* + * Feature set definitions + */ + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 + +extern void ext4_update_dynamic_rev(struct super_block *sb); + +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + ext4_update_dynamic_rev(sb); \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) 
+EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) +EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXTN_FEATURE_FUNCS(ver) \ +static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static 
inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + +/* + * Superblock flags + */ +#define EXT4_FLAGS_RESIZING 0 +#define EXT4_FLAGS_SHUTDOWN 1 + +static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +{ + return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); +} + + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ + !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + *(u32 *)desc.ctx = crc; + + BUG_ON(crypto_shash_update(&desc.shash, address, length)); + + return *(u32 *)desc.ctx; +} + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + +struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_str crypto_buf; +#endif +#ifdef CONFIG_UNICODE + struct fscrypt_str cf_name; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) 
+{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +static inline bool ext4_is_quota_file(struct inode *inode) +{ + return IS_NOQUOTA(inode) && + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) + +/* htree levels for ext4 */ +#define EXT4_HTREE_LEVEL_COMPAT 2 +#define EXT4_HTREE_LEVEL 3 + +static inline int ext4_dir_htree_level(struct super_block *sb) +{ + return ext4_has_feature_largedir(sb) ? + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; +} + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. 
+ */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t
ext4_inode_to_goal_block(struct inode *); + +#ifdef CONFIG_UNICODE +extern void ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct fscrypt_str *fname); +#endif + +#ifdef CONFIG_FS_ENCRYPTION +static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, + const struct fscrypt_name *src) +{ + memset(dst, 0, sizeof(*dst)); + + dst->usr_fname = src->usr_fname; + dst->disk_name = src->disk_name; + dst->hinfo.hash = src->hash; + dst->hinfo.minor_hash = src->minor_hash; + dst->crypto_buf = src->crypto_buf; +} + +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); +#endif + return 0; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); +#endif + return 0; +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; + +#ifdef CONFIG_UNICODE + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} +#else /* !CONFIG_FS_ENCRYPTION */ +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, + struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + +#ifdef CONFIG_UNICODE + ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); +#endif + + return 0; +} + +static inline int ext4_fname_prepare_lookup(struct inode *dir, + struct dentry *dentry, + struct ext4_filename *fname) +{ + return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); +} + +static inline void ext4_fname_free_filename(struct ext4_filename *fname) +{ +#ifdef CONFIG_UNICODE + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} +#endif /* !CONFIG_FS_ENCRYPTION */ + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct fscrypt_str *ent_name); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); +static inline void ext4_update_dx_flag(struct inode *inode) +{ 
+ if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... */ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } +} +static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, + struct dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, + int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + 0, (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern const struct seq_operations ext4_mb_seq_groups_ops; +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); +extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); + +/* inode.c */ +int ext4_inode_is_fast_symlink(struct inode *inode); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); +int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, + bool wait, struct buffer_head 
**bhs); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern int ext4_truncate(struct inode *); +extern int ext4_break_layouts(struct inode *); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, 
ext4_lblk_t end); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); +extern bool ext4_empty_dir(struct inode *inode); + +/* resize.c */ +extern void ext4_kvfree_array_rcu(void *to_free); +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); +extern int ext4_calculate_overhead(struct super_block *sb); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); + +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) 
\ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && + !EXT4_SB(sb)->s_chksum_driver); + + return ext4_has_feature_metadata_csum(sb) && + (EXT4_SB(sb)->s_chksum_driver != NULL); +} + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); +} + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline 
void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct super_block *sb, + struct ext4_inode *raw_inode) +{ + if (ext4_has_feature_largedir(sb) || + S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info **grp_info; + long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); + up_write(&EXT4_I(inode)->i_data_sem); +} + +/* Update i_size, i_disksize. 
Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. 
+ */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +#ifdef CONFIG_UNICODE +extern const struct dentry_operations ext4_dentry_ops; +#endif + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct inode *dir, struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int ext4_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline, __u64 start, __u64 len); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + +extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct 
inode *inode); + +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void ext4_initialize_dirent_tail(struct buffer_head *bh, + unsigned int blocksize); +extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh); +extern int ext4_ci_compare(const struct inode *parent, + const struct qstr *fname, + const struct qstr *entry, bool quick); + +#define S_SHIFT 12 +static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (ext4_has_feature_filetype(sb)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages, bool is_readahead); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); + +/* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* sysfs.c */ +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); + +/* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. 
+ */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path **, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* verity.c */ +extern const struct 
fsverity_operations ext4_verityops; + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +/* + * Disable DIO read nolock optimization, so new dioreaders will be forced + * to grab i_mutex + */ +static inline void ext4_inode_block_unlocked_dio(struct inode *inode) +{ + ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); + smp_mb(); +} +static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb(); + ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; + +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); + } +} + +extern const struct iomap_ops ext4_iomap_ops; + +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + +#endif /* __KERNEL__ */ + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _EXT4_H */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_old/extents_status.h b/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_old/extents_status.h new file mode 100644 index 0000000000000000000000000000000000000000..80a62ee17a81d073368dbec55e93c108b413b4fb --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/fs/ext4_old/extents_status.h @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang + * Modified by + * Allison Henderson + * Zheng Liu + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. 
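+ *
+ * (Editor's note, an assumption based on the ES_DEBUG__ convention above:
+ * the trailing underscores leave this extra consistency check disabled in
+ * normal builds; it is intended as a debugging aid only.)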
+ */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; + +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) + +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) + +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +struct ext4_es_stats { + unsigned long es_stats_shrunk; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_shk_cnt; +}; + +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. + * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. 
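+ *
+ * Illustrative usage (editor's addition): whether the cluster containing a
+ * given logical block still carries a pending reservation can be tested
+ * through the helper declared further down in this header, e.g.
+ *
+ *	bool pending = ext4_is_pending(inode, lblk);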
+ */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, + struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; +} + +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + +static inline void ext4_es_set_referenced(struct extent_status *es) +{ + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); +} + +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); + +extern int ext4_seq_es_shrinker_info_show(struct 
seq_file *seq, void *v); + +extern unsigned int ext4_shrink_es_timeout; +extern unsigned int ext4_shrink_es_timeout_min; + +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_clear_inode_es(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/fs/fuse/fuse_i.h b/ops/os_stat/os_stat/include_tk4_arm/fs/fuse/fuse_i.h new file mode 100644 index 0000000000000000000000000000000000000000..5f14cf8e08ffd3abbc3352f92c60b40cdf476ec4 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/fs/fuse/fuse_i.h @@ -0,0 +1,1111 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2008 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#ifndef _FS_FUSE_I_H +#define _FS_FUSE_I_H + +#ifndef pr_fmt +# define pr_fmt(fmt) "fuse: " fmt +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** Default max number of pages that can be used in a single read request */ +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + +/** Maximum of max_pages received in init_out */ +#define FUSE_MAX_MAX_PAGES 256 + +/** Bias for fibbwritectr, meaning new writepages must not be sent */ +#define FUSE_NOWRITE INT_MIN + +/** It could be as large as PATH_MAX, but would that have any uses? */ +#define FUSE_NAME_MAX 1024 + +/** Number of dentries for each connection in the control filesystem */ +#define FUSE_CTL_NUM_DENTRIES 5 + +/** List of active connections */ +extern struct list_head fuse_conn_list; + +/** Global mutex protecting fuse_conn_list and the control filesystem */ +extern struct mutex fuse_mutex; + +/** Module parameters */ +extern unsigned max_user_bgreq; +extern unsigned max_user_congthresh; + +/* One forget request */ +struct fuse_forget_link { + struct fuse_forget_one forget_one; + struct fuse_forget_link *next; +}; + +/** FUSE inode */ +struct fuse_inode { + /** Inode data */ + struct inode inode; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** Number of lookups on this inode */ + u64 nlookup; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; + + /** Time in jiffies until the file attributes are valid */ + u64 i_time; + + /* Which attributes are invalid */ + u32 inval_mask; + + /** The sticky bit in inodebbi_mode may have been removed, so + preserve the original mode */ + umode_t orig_i_mode; + + /** 64 bit inode number */ + u64 orig_ino; + + /** Version of last attribute change */ + u64 attr_version; + + union { + /* Write related fields (regular file only) */ + struct { + /* Files usable in writepage. 
Protected by fibblock */ + struct list_head write_files; + + /* Writepages pending on truncate or fsync */ + struct list_head queued_writes; + + /* Number of sent writes, a negative bias + * (FUSE_NOWRITE) means more writes are blocked */ + int writectr; + + /* Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /* List of writepage requestst (pending or sent) */ + struct list_head writepages; + }; + + /* readdir cache (directory only) */ + struct { + /* true if fully cached */ + bool cached; + + /* size of cache */ + loff_t size; + + /* position at end of cache (position of next entry) */ + loff_t pos; + + /* version of the cache */ + u64 version; + + /* modification time of directory when cache was + * started */ + struct timespec64 mtime; + + /* iversion of directory when cache was started */ + u64 iversion; + + /* protects above fields */ + spinlock_t lock; + } rdc; + }; + + /** Miscellaneous bits describing inode state */ + unsigned long state; + + /** Lock for serializing lookup and readdir for back compatibility*/ + struct mutex mutex; + + /** Lock to protect write related fields */ + spinlock_t lock; +}; + +/** FUSE inode state bits */ +enum { + /** Advise readdirplus */ + FUSE_I_ADVISE_RDPLUS, + /** Initialized with readdirplus */ + FUSE_I_INIT_RDPLUS, + /** An operation changing file size is in progress */ + FUSE_I_SIZE_UNSTABLE, + /* Bad inode */ + FUSE_I_BAD, +}; + +struct fuse_conn; +struct fuse_release_args; + +/** FUSE specific file data */ +struct fuse_file { + /** Fuse connection for this file */ + struct fuse_conn *fc; + + /* Argument space reserved for release */ + struct fuse_release_args *release_args; + + /** Kernel file handle guaranteed to be unique */ + u64 kh; + + /** File handle used by userspace */ + u64 fh; + + /** Node id of this file */ + u64 nodeid; + + /** Refcount */ + refcount_t count; + + /** FOPEN_* flags returned by open */ + u32 open_flags; + + /** Entry on inode's write_files list */ + struct list_head write_entry; + + /* Readdir related */ + struct { + /* + * Protects below fields against (crazy) parallel readdir on + * same open file. Uncontended in the normal case. + */ + struct mutex lock; + + /* Dir stream position */ + loff_t pos; + + /* Offset in cache */ + loff_t cache_off; + + /* Version of cache we are reading */ + u64 version; + + } readdir; + + /** RB node to be linked on fuse_connbbpolled_files */ + struct rb_node polled_node; + + /** Wait queue head for poll */ + wait_queue_head_t poll_wait; + + /** Has flock been performed on this file? 
*/ + bool flock:1; +}; + +/** One input argument of a request */ +struct fuse_in_arg { + unsigned size; + const void *value; +}; + +/** One output argument of a request */ +struct fuse_arg { + unsigned size; + void *value; +}; + +/** FUSE page descriptor */ +struct fuse_page_desc { + unsigned int length; + unsigned int offset; +}; + +struct fuse_args { + uint64_t nodeid; + uint32_t opcode; + unsigned short in_numargs; + unsigned short out_numargs; + bool force:1; + bool noreply:1; + bool nocreds:1; + bool in_pages:1; + bool out_pages:1; + bool user_pages:1; + bool out_argvar:1; + bool page_zeroing:1; + bool page_replace:1; + bool may_block:1; + struct fuse_in_arg in_args[3]; + struct fuse_arg out_args[2]; + void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); +}; + +struct fuse_args_pages { + struct fuse_args args; + struct page **pages; + struct fuse_page_desc *descs; + unsigned int num_pages; +}; + +#define FUSE_ARGS(args) struct fuse_args args = {} + +/** The request IO state (for asynchronous processing) */ +struct fuse_io_priv { + struct kref refcnt; + int async; + spinlock_t lock; + unsigned reqs; + ssize_t bytes; + size_t size; + __u64 offset; + bool write; + bool should_dirty; + int err; + struct kiocb *iocb; + struct completion *done; + bool blocking; +}; + +#define FUSE_IO_PRIV_SYNC(i) \ +{ \ + .refcnt = KREF_INIT(1), \ + .async = 0, \ + .iocb = i, \ +} + +/** + * Request flags + * + * FR_ISREPLY: set if the request has reply + * FR_FORCE: force sending of the request even if interrupted + * FR_BACKGROUND: request is sent in the background + * FR_WAITING: request is counted as "waiting" + * FR_ABORTED: the request was aborted + * FR_INTERRUPTED: the request has been interrupted + * FR_LOCKED: data is being copied to/from the request + * FR_PENDING: request is not yet in userspace + * FR_SENT: request is in userspace, waiting for an answer + * FR_FINISHED: request is finished + * FR_PRIVATE: request is on private list + * FR_ASYNC: request is asynchronous + */ +enum fuse_req_flag { + FR_ISREPLY, + FR_FORCE, + FR_BACKGROUND, + FR_WAITING, + FR_ABORTED, + FR_INTERRUPTED, + FR_LOCKED, + FR_PENDING, + FR_SENT, + FR_FINISHED, + FR_PRIVATE, + FR_ASYNC, +}; + +/** + * A request to the client + * + * .waitq.lock protects the following fields: + * - FR_ABORTED + * - FR_LOCKED (may also be modified under fcbblock, tested under both) + */ +struct fuse_req { + /** This can be on either pending processing or io lists in + fuse_conn */ + struct list_head list; + + /** Entry on the interrupts list */ + struct list_head intr_entry; + + /* Input/output arguments */ + struct fuse_args *args; + + /** refcount */ + refcount_t count; + + /* Request flags, updated with test/set/clear_bit() */ + unsigned long flags; + + /* The request input header */ + struct { + struct fuse_in_header h; + } in; + + /* The request output header */ + struct { + struct fuse_out_header h; + } out; + + /** Used to wake up the task waiting for completion of request*/ + wait_queue_head_t waitq; + +#if IS_ENABLED(CONFIG_VIRTIO_FS) + /** virtio-fs's physically contiguous buffer for in and out args */ + void *argbuf; +#endif +}; + +struct fuse_iqueue; + +/** + * Input queue callbacks + * + * Input queue signalling is device-specific. For example, the /dev/fuse file + * uses fiqbbwaitq and fasync to wake processes that are waiting on queue + * readiness. These callbacks allow other device types to respond to input + * queue activity. 
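+ *
+ * Illustrative note (editor's addition): the /dev/fuse transport uses
+ * fuse_dev_fiq_ops, declared just below; an alternative transport can
+ * supply its own callback table and private state when the connection is
+ * set up, along the lines of
+ *
+ *	fuse_conn_init(fc, user_ns, &my_fiq_ops, my_priv);
+ *
+ * where my_fiq_ops and my_priv are hypothetical names; see the
+ * fuse_conn_init() prototype later in this header.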
+ */ +struct fuse_iqueue_ops { + /** + * Signal that a forget has been queued + */ + void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiqbblock); + + /** + * Signal that an INTERRUPT request has been queued + */ + void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiqbblock); + + /** + * Signal that a request has been queued + */ + void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiqbblock); + + /** + * Clean up when fuse_iqueue is destroyed + */ + void (*release)(struct fuse_iqueue *fiq); +}; + +/** /dev/fuse input queue operations */ +extern const struct fuse_iqueue_ops fuse_dev_fiq_ops; + +struct fuse_iqueue { + /** Connection established */ + unsigned connected; + + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The next unique request id */ + u64 reqctr; + + /** The list of pending requests */ + struct list_head pending; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; + + /** Device-specific callbacks */ + const struct fuse_iqueue_ops *ops; + + /** Device-specific state */ + void *priv; +}; + +#define FUSE_PQ_HASH_BITS 8 +#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS) + +struct fuse_pqueue { + /** Connection established */ + unsigned connected; + + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** Hash table of requests being processed */ + struct list_head *processing; + + /** The list of requests under I/O */ + struct list_head io; +}; + +/** + * Fuse device instance + */ +struct fuse_dev { + /** Fuse connection for this device */ + struct fuse_conn *fc; + + /** Processing queue */ + struct fuse_pqueue pq; + + /** list entry on fcbbdevices */ + struct list_head entry; +}; + +struct fuse_fs_context { + int fd; + unsigned int rootmode; + kuid_t user_id; + kgid_t group_id; + bool is_bdev:1; + bool fd_present:1; + bool rootmode_present:1; + bool user_id_present:1; + bool group_id_present:1; + bool default_permissions:1; + bool allow_other:1; + bool destroy:1; + bool no_control:1; + bool no_force_umount:1; + bool no_mount_options:1; + unsigned int max_read; + unsigned int blksize; + const char *subtype; + + /* fuse_dev pointer to fill in, should contain NULL on entry */ + void **fudptr; +}; + +/** + * A Fuse connection. + * + * This structure is created, when the filesystem is mounted, and is + * destroyed, when the client device is closed and the filesystem is + * unmounted. 
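+ *
+ * Editor's note: the structure is reference counted; fuse_conn_get() and
+ * fuse_conn_put(), declared later in this header, take and drop
+ * references, and the ->release callback stored in the structure runs on
+ * the final put.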
+ */ +struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** Refcount */ + refcount_t count; + + /** Number of fuse_dev's */ + atomic_t dev_count; + + struct rcu_head rcu; + + /** The user id for this mount */ + kuid_t user_id; + + /** The group id for this mount */ + kgid_t group_id; + + /** The pid namespace for this mount */ + struct pid_namespace *pid_ns; + + /** The user namespace for this mount */ + struct user_namespace *user_ns; + + /** Maximum read size */ + unsigned max_read; + + /** Maximum write size */ + unsigned max_write; + + /** Maxmum number of pages that can be used in a single request */ + unsigned int max_pages; + + /** Input queue */ + struct fuse_iqueue iq; + + /** The next unique kernel file handle */ + atomic64_t khctr; + + /** rbtree of fuse_files waiting for poll events indexed by ph */ + struct rb_root polled_files; + + /** Maximum number of outstanding background requests */ + unsigned max_background; + + /** Number of background requests at which congestion starts */ + unsigned congestion_threshold; + + /** Number of requests currently in the background */ + unsigned num_background; + + /** Number of background requests currently queued for userspace */ + unsigned active_background; + + /** The list of background requests set aside for later queuing */ + struct list_head bg_queue; + + /** Protects: max_background, congestion_threshold, num_background, + * active_background, bg_queue, blocked */ + spinlock_t bg_lock; + + /** Flag indicating that INIT reply has been received. Allocating + * any fuse request will be suspended until the flag is set */ + int initialized; + + /** Flag indicating if connection is blocked. This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + + /** Connection established, cleared on umount, connection + abort and device release */ + unsigned connected; + + /** Connection aborted via sysfs */ + bool aborted; + + /** Connection failed (version mismatch). Cannot race with + setting other bitfields since it is only set once in INIT + reply, before any other request, and never cleared */ + unsigned conn_error:1; + + /** Connection successful. Only set in INIT */ + unsigned conn_init:1; + + /** Do readpages asynchronously? Only set in INIT */ + unsigned async_read:1; + + /** Return an unique read error after abort. Only set in INIT */ + unsigned abort_err:1; + + /** Do not send separate SETATTR request before open(O_TRUNC) */ + unsigned atomic_o_trunc:1; + + /** Filesystem supports NFS exporting. Only set in INIT */ + unsigned export_support:1; + + /** write-back cache policy (default is write-through) */ + unsigned writeback_cache:1; + + /** allow parallel lookups and readdir (default is serialized) */ + unsigned parallel_dirops:1; + + /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ + unsigned handle_killpriv:1; + + /** cache READLINK responses in page cache */ + unsigned cache_symlinks:1; + + /* + * The following bitfields are only for optimization purposes + * and hence races in setting them will not cause malfunction + */ + + /** Is open/release not implemented by fs? */ + unsigned no_open:1; + + /** Is opendir/releasedir not implemented by fs? */ + unsigned no_opendir:1; + + /** Is fsync not implemented by fs? */ + unsigned no_fsync:1; + + /** Is fsyncdir not implemented by fs? 
*/ + unsigned no_fsyncdir:1; + + /** Is flush not implemented by fs? */ + unsigned no_flush:1; + + /** Is setxattr not implemented by fs? */ + unsigned no_setxattr:1; + + /** Is getxattr not implemented by fs? */ + unsigned no_getxattr:1; + + /** Is listxattr not implemented by fs? */ + unsigned no_listxattr:1; + + /** Is removexattr not implemented by fs? */ + unsigned no_removexattr:1; + + /** Are posix file locking primitives not implemented by fs? */ + unsigned no_lock:1; + + /** Is access not implemented by fs? */ + unsigned no_access:1; + + /** Is create not implemented by fs? */ + unsigned no_create:1; + + /** Is interrupt not implemented by fs? */ + unsigned no_interrupt:1; + + /** Is bmap not implemented by fs? */ + unsigned no_bmap:1; + + /** Is poll not implemented by fs? */ + unsigned no_poll:1; + + /** Do multi-page cached writes */ + unsigned big_writes:1; + + /** Don't apply umask to creation modes */ + unsigned dont_mask:1; + + /** Are BSD file locking primitives not implemented by fs? */ + unsigned no_flock:1; + + /** Is fallocate not implemented by fs? */ + unsigned no_fallocate:1; + + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + + /** Use enhanced/automatic page cache invalidation. */ + unsigned auto_inval_data:1; + + /** Filesystem is fully reponsible for page cache invalidation. */ + unsigned explicit_inval_data:1; + + /** Does the filesystem support readdirplus? */ + unsigned do_readdirplus:1; + + /** Does the filesystem want adaptive readdirplus? */ + unsigned readdirplus_auto:1; + + /** Does the filesystem support asynchronous direct-IO submission? */ + unsigned async_dio:1; + + /** Is lseek not implemented by fs? */ + unsigned no_lseek:1; + + /** Does the filesystem support posix acls? */ + unsigned posix_acl:1; + + /** Check permissions based on the file mode or not? */ + unsigned default_permissions:1; + + /** Allow other than the mounter user to access the filesystem ? */ + unsigned allow_other:1; + + /** Does the filesystem support copy_file_range? */ + unsigned no_copy_file_range:1; + + /* Send DESTROY request */ + unsigned int destroy:1; + + /* Delete dentries that have gone stale */ + unsigned int delete_stale:1; + + /** Do not create entry in fusectl fs */ + unsigned int no_control:1; + + /** Do not allow MNT_FORCE umount */ + unsigned int no_force_umount:1; + + /* Do not show mount options */ + unsigned int no_mount_options:1; + + /** The number of requests waiting for completion */ + atomic_t num_waiting; + + /** Negotiated minor version */ + unsigned minor; + + /** Entry on the fuse_conn_list */ + struct list_head entry; + + /** Device ID from super block */ + dev_t dev; + + /** Dentries in the control filesystem */ + struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; + + /** number of dentries used in the above array */ + int ctl_ndents; + + /** Key for lock owner ID scrambling */ + u32 scramble_key[4]; + + /** Version counter for attribute changes */ + atomic64_t attr_version; + + /** Called on final put */ + void (*release)(struct fuse_conn *); + + /** Super block for this connection. */ + struct super_block *sb; + + /** Read/write semaphore to hold when accessing sb. 
*/ + struct rw_semaphore killsb; + + /** List of device instances belonging to this connection */ + struct list_head devices; +}; + +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct fuse_conn *get_fuse_conn(struct inode *inode) +{ + return get_fuse_conn_super(inode->i_sb); +} + +static inline struct fuse_inode *get_fuse_inode(struct inode *inode) +{ + return container_of(inode, struct fuse_inode, inode); +} + +static inline u64 get_node_id(struct inode *inode) +{ + return get_fuse_inode(inode)->nodeid; +} + +static inline int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + +static inline u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + return atomic64_read(&fc->attr_version); +} + +static inline void fuse_make_bad(struct inode *inode) +{ + remove_inode_hash(inode); + set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); +} + +static inline bool fuse_is_bad(struct inode *inode) +{ + return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); +} + +/** Device operations */ +extern const struct file_operations fuse_dev_operations; + +extern const struct dentry_operations fuse_dentry_operations; +extern const struct dentry_operations fuse_root_dentry_operations; + +/** + * Inode to nodeid comparison. + */ +int fuse_inode_eq(struct inode *inode, void *_nodeidp); + +/** + * Get a filled in inode + */ +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, + int generation, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + +/** + * Send FORGET command + */ +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, + u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); + +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp); + +/* + * Initialize READ or READDIR request + */ +struct fuse_io_args { + union { + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + bool page_locked; + } write; + }; + struct fuse_args_pages ap; + struct fuse_io_priv *io; + struct fuse_file *ff; +}; + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode); + + +/** + * Send OPEN or OPENDIR request + */ +int fuse_open_common(struct inode *inode, struct file *file, bool isdir); + +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +void fuse_file_free(struct fuse_file *ff); +void fuse_finish_open(struct inode *inode, struct file *file); + +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags); + +/** + * Send RELEASE or RELEASEDIR request + */ +void fuse_release_common(struct file *file, bool isdir); + +/** + * Send FSYNC or FSYNCDIR request + */ +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int opcode); + +/** + * Notify poll wakeup + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg); + +/** + * Initialize file operations on a regular file + */ +void fuse_init_file_inode(struct inode *inode); + +/** + * Initialize inode operations on regular files and special files + */ +void fuse_init_common(struct inode *inode); + +/** + * Initialize inode and file operations on a directory + */ +void
fuse_init_dir(struct inode *inode); + +/** + * Initialize inode operations on a symlink + */ +void fuse_init_symlink(struct inode *inode); + +/** + * Change attributes of an inode + */ +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version); + +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid); + +/** + * Initialize the client device + */ +int fuse_dev_init(void); + +/** + * Cleanup the client device + */ +void fuse_dev_cleanup(void); + +int fuse_ctl_init(void); +void __exit fuse_ctl_cleanup(void); + +/** + * Simple request sending that does request allocation and freeing + */ +ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags); + +/** + * End a finished request + */ +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); + +/* Abort all requests */ +void fuse_abort_conn(struct fuse_conn *fc); +void fuse_wait_aborted(struct fuse_conn *fc); + +/** + * Invalidate inode attributes + */ +void fuse_invalidate_attr(struct inode *inode); + +void fuse_invalidate_entry_cache(struct dentry *entry); + +void fuse_invalidate_atime(struct inode *inode); + +u64 entry_attr_timeout(struct fuse_entry_out *o); +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); + +/** + * Acquire reference to fuse_conn + */ +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); + +/** + * Initialize fuse_conn + */ +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); + +/** + * Release reference to fuse_conn + */ +void fuse_conn_put(struct fuse_conn *fc); + +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(void); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); +void fuse_dev_free(struct fuse_dev *fud); +void fuse_send_init(struct fuse_conn *fc); + +/** + * Fill in superblock and initialize fuse connection + * @sb: partially-initialized superblock to fill in + * @ctx: mount context + */ +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); + +/** + * Disassociate fuse connection from superblock and kill the superblock + * + * Calls kill_anon_super(), do not use with bdev mounts. + */ +void fuse_kill_sb_anon(struct super_block *sb); + +/** + * Add connection to control filesystem + */ +int fuse_ctl_add_conn(struct fuse_conn *fc); + +/** + * Remove connection from control filesystem + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc); + +/** + * Is file type valid? + */ +int fuse_valid_type(int m); + +bool fuse_invalid_attr(struct fuse_attr *attr); + +/** + * Is current process allowed to perform filesystem operation? + */ +int fuse_allow_current_process(struct fuse_conn *fc); + +u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); + +void fuse_update_ctime(struct inode *inode); + +int fuse_update_attributes(struct inode *inode, struct file *file); + +void fuse_flush_writepages(struct inode *inode); + +void fuse_set_nowrite(struct inode *inode); +void fuse_release_nowrite(struct inode *inode); + +/** + * File-system tells the kernel to invalidate cache for the given node id. + */ +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len); + +/** + * File-system tells the kernel to invalidate parent attributes and + * the dentry matching parent/name. 
+ * + * If the child_nodeid is non-zero and: + * - matches the inode number for the dentry matching parent/name, + * - is not a mount point + * - is a file or oan empty directory + * then the dentry is unhashed (d_delete()). + */ +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + u64 child_nodeid, struct qstr *name); + +int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, + bool isdir); + +/** + * fuse_direct_io() flags + */ + +/** If set, it is WRITE; otherwise - READ */ +#define FUSE_DIO_WRITE (1 << 0) + +/** CUSE pass fuse_direct_io() a file which f_mappingbbhost is not from FUSE */ +#define FUSE_DIO_CUSE (1 << 1) + +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags); +long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + unsigned int flags); +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags); +__poll_t fuse_file_poll(struct file *file, poll_table *wait); +int fuse_dev_release(struct inode *inode, struct file *file); + +bool fuse_write_update_size(struct inode *inode, loff_t pos); + +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); + +int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, + struct file *file); + +void fuse_set_initialized(struct fuse_conn *fc); + +void fuse_unlock_inode(struct inode *inode, bool locked); +bool fuse_lock_inode(struct inode *inode); + +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags); +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size); +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); +int fuse_removexattr(struct inode *inode, const char *name); +extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler *fuse_acl_xattr_handlers[]; +extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; + +struct posix_acl; +struct posix_acl *fuse_get_acl(struct inode *inode, int type); +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + + +/* readdir.c */ +int fuse_readdir(struct file *file, struct dir_context *ctx); + +/** + * Return the number of bytes in an arguments list + */ +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); + +/** + * Get the next unique ID for a request + */ +u64 fuse_get_unique(struct fuse_iqueue *fiq); +void fuse_free_conn(struct fuse_conn *fc); + +#endif /* _FS_FUSE_I_H */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/fs/proc/internal.h b/ops/os_stat/os_stat/include_tk4_arm/fs/proc/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..1d9488e24fc8e3caccade55debd5e76a9e19013a --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/fs/proc/internal.h @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Internal procfs definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct ctl_table_header; +struct mempolicy; + +/* + * This is not completely implemented yet. The idea is to + * create an in-memory tree (like the actual /proc filesystem + * tree) of these proc_dir_entries, so that we can dynamically + * add new files to /proc. 
+ * + * parent/subdir are used for the directory structure (every /proc file has a + * parent, but "subdir" is empty for all non-directory entries). + * subdir_node is used to build the rb tree "subdir" of the parent. + */ +struct proc_dir_entry { + /* + * number of callers into module in progress; + * negative -> it's going away RSN + */ + atomic_t in_use; + refcount_t refcnt; + struct list_head pde_openers; /* who did ->open, but not ->release */ + /* protects ->pde_openers and all struct pde_opener instances */ + spinlock_t pde_unload_lock; + struct completion *pde_unload_completion; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + const struct dentry_operations *proc_dops; + union { + const struct seq_operations *seq_ops; + int (*single_show)(struct seq_file *, void *); + }; + proc_write_t write; + void *data; + unsigned int state_size; + unsigned int low_ino; + nlink_t nlink; + kuid_t uid; + kgid_t gid; + loff_t size; + struct proc_dir_entry *parent; + struct rb_root subdir; + struct rb_node subdir_node; + char *name; + umode_t mode; + u8 namelen; + char inline_name[]; +} __randomize_layout; + +#define SIZEOF_PDE ( \ + sizeof(struct proc_dir_entry) < 128 ? 128 : \ + sizeof(struct proc_dir_entry) < 192 ? 192 : \ + sizeof(struct proc_dir_entry) < 256 ? 256 : \ + sizeof(struct proc_dir_entry) < 512 ? 512 : \ + 0) +#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) + +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + +union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); + const char *lsm; +}; + +struct proc_inode { + struct pid *pid; + unsigned int fd; + union proc_op op; + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; + struct hlist_node sysctl_inodes; + const struct proc_ns_operations *ns_ops; + struct inode vfs_inode; +} __randomize_layout; + +/* + * General functions + */ +static inline struct proc_inode *PROC_I(const struct inode *inode) +{ + return container_of(inode, struct proc_inode, vfs_inode); +} + +static inline struct proc_dir_entry *PDE(const struct inode *inode) +{ + return PROC_I(inode)->pde; +} + +static inline void *__PDE_DATA(const struct inode *inode) +{ + return PDE(inode)->data; +} + +static inline struct pid *proc_pid(const struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(const struct inode *inode) +{ + return get_pid_task(proc_pid(inode), PIDTYPE_PID); +} + +void task_dump_owner(struct task_struct *task, umode_t mode, + kuid_t *ruid, kgid_t *rgid); + +unsigned name_to_int(const struct qstr *qstr); +/* + * Offset of the first process in the /proc root directory.. + */ +#define FIRST_PROCESS_ENTRY 256 + +/* Worst case buffer size needed for holding an integer. 
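+ * (Editor's note: 13 presumably covers an optional sign, the ten decimal
+ * digits of a 32-bit value, a trailing newline and a NUL terminator.)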
*/ +#define PROC_NUMBUF 13 + +/* + * array.c + */ +extern const struct file_operations proc_tid_children_operations; + +extern void proc_task_name(struct seq_file *m, struct task_struct *p, + bool escape); +extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int host_pid_info(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +#ifdef CONFIG_DAMON_VADDR +extern int proc_damon_map(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +#endif +/* + * base.c + */ +extern const struct dentry_operations pid_dentry_operations; +extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int proc_setattr(struct dentry *, struct iattr *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); +extern void pid_update_inode(struct task_struct *, struct inode *); +extern int pid_delete_dentry(const struct dentry *); +extern int proc_pid_readdir(struct file *, struct dir_context *); +struct dentry *proc_pid_lookup(struct dentry *, unsigned int); +extern loff_t mem_lseek(struct file *, loff_t, int); + +/* Lookups */ +typedef struct dentry *instantiate_t(struct dentry *, + struct task_struct *, const void *); +bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, + instantiate_t, struct task_struct *, const void *); + +/* + * generic.c + */ +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data); +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); +extern int proc_readdir(struct file *, struct dir_context *); +int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); + +static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +{ + refcount_inc(&pde->refcnt); + return pde; +} +extern void pde_put(struct proc_dir_entry *); + +static inline bool is_empty_pde(const struct proc_dir_entry *pde) +{ + return S_ISDIR(pde->mode) && !pde->proc_iops; +} +extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); + +/* + * inode.c + */ +struct pde_opener { + struct file *file; + struct list_head lh; + bool closing; + struct completion *c; +} __randomize_layout; +extern const struct inode_operations proc_link_inode_operations; +extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct super_operations proc_sops; + +void proc_init_kmemcache(void); +void set_proc_pid_nlink(void); +extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +extern void proc_entry_rundown(struct proc_dir_entry *); + +/* + * proc_namespaces.c + */ +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + +/* + * proc_net.c + */ +extern const struct file_operations proc_net_operations; +extern const struct inode_operations 
proc_net_inode_operations; + +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +/* + * proc_self.c + */ +extern int proc_setup_self(struct super_block *); + +/* + * proc_thread_self.c + */ +extern int proc_setup_thread_self(struct super_block *); +extern void proc_thread_self_init(void); + +/* + * proc_sysctl.c + */ +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head); +#else +static inline void proc_sys_init(void) { } +static inline void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head) { } +#endif + +/* + * proc_tty.c + */ +#ifdef CONFIG_TTY +extern void proc_tty_init(void); +#else +static inline void proc_tty_init(void) {} +#endif + +/* + * root.c + */ +extern struct proc_dir_entry proc_root; + +extern void proc_self_init(void); + +/* + * task_[no]mmu.c + */ +struct mem_size_stats; +struct proc_maps_private { + struct inode *inode; + struct task_struct *task; + struct mm_struct *mm; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif +} __randomize_layout; + +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_pid_smaps_rollup_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; + +extern unsigned long task_vsize(struct mm_struct *); +extern unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +#ifdef CONFIG_MMU +extern void task_mem(struct seq_file *m, struct mm_struct *mm, struct task_struct *task); +#else +extern void task_mem(struct seq_file *, struct mm_struct *); +#endif + +extern const struct dentry_operations proc_net_dentry_ops; +static inline void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} diff --git a/ops/os_stat/os_stat/include_tk4_arm/fs/xfs/xfs_log_priv.h b/ops/os_stat/os_stat/include_tk4_arm/fs/xfs/xfs_log_priv.h new file mode 100644 index 0000000000000000000000000000000000000000..b880c23cb6e4ffd78324ff26a2890c0010f67d64 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/fs/xfs/xfs_log_priv.h @@ -0,0 +1,607 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_LOG_PRIV_H__ +#define __XFS_LOG_PRIV_H__ + +struct xfs_buf; +struct xlog; +struct xlog_ticket; +struct xfs_mount; + +/* + * Flags for log structure + */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. 
+ * + * this has endian issues, of course. + */ +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ + +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). 
+ * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. + * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + +/* Ticket reservation region accounting */ +#define XLOG_TIC_LEN_MAX 15 + +/* + * Reservation region + * As would be stored in xfs_log_iovec but without the i_addr which + * we don't care about. + */ +typedef struct xlog_res { + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ +} xlog_res_t; + +typedef struct xlog_ticket { + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + char t_ocnt; /* original count : 1 */ + char t_cnt; /* current count : 1 */ + char t_clientid; /* who does this belong to; : 1 */ + char t_flags; /* properties of reservation : 1 */ + + /* reservation array fields */ + uint t_res_num; /* num in array : 4 */ + uint t_res_num_ophdrs; /* num op hdrs : 4 */ + uint t_res_arr_sum; /* array sum : 4 */ + uint t_res_o_flow; /* sum overflow : 4 */ + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ +} xlog_ticket_t; + +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. + * - ic_next is the pointer to the next iclog in the ring. + * - ic_log is a pointer back to the global log structure. + * - ic_size is the full size of the log buffer, minus the cycle headers. + * - ic_io_size is the size of the currently pending log buffer write, which + * might be smaller than ic_size + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callbacks + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. 
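+ *
+ * (Editor's note: in the definition below this separation is expressed
+ * with the ____cacheline_aligned_in_smp annotations on ic_callback_lock
+ * and ic_refcnt, so each contended group starts on its own cache line.)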
+ */ +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xlog *ic_log; + u32 ic_size; + u32 ic_io_size; + u32 ic_offset; + unsigned short ic_state; + char *ic_datap; /* pointer to iclog data */ + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + struct list_head ic_callbacks; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header +#ifdef DEBUG + bool ic_fail_crc : 1; +#endif + struct semaphore ic_sema; + struct work_struct ic_end_io_work; + struct bio ic_bio; + struct bio_vec ic_bvec[]; +} xlog_in_core_t; + +/* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct list_head iclog_entry; + struct list_head committing; /* ctx committing list */ + struct work_struct discard_endio_work; +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct xlog *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; + struct xfs_cil_ctx *xc_ctx; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; +} ____cacheline_aligned_in_smp; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. 
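One piece of the xfs_cil comment above that benefits from an example is the committing-list lookup: given a checkpoint sequence number, scan the list of committing contexts and, if a match is found, return its commit record LSN. The sketch below models just that scan; the structure and function names are invented, and the real code additionally blocks until the commit LSN of a still-committing checkpoint is known.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t lsn_t;
#define LSN_UNKNOWN 0   /* in this model: commit record not yet written */

/* a committing checkpoint context: its sequence plus (eventually) its commit LSN */
struct ctx_model {
        uint64_t sequence;
        lsn_t commit_lsn;
        struct ctx_model *next;
};

/*
 * Return the commit LSN for @sequence if a matching context is on the
 * committing list and its commit record is already written; return
 * LSN_UNKNOWN otherwise (the real code would wait and retry).
 */
static lsn_t commit_lsn_for_sequence(struct ctx_model *committing, uint64_t sequence)
{
        for (struct ctx_model *c = committing; c; c = c->next)
                if (c->sequence == sequence)
                        return c->commit_lsn;
        return LSN_UNKNOWN;
}

int main(void)
{
        struct ctx_model c2 = { .sequence = 2, .commit_lsn = 0x500, .next = NULL };
        struct ctx_model c3 = { .sequence = 3, .commit_lsn = LSN_UNKNOWN, .next = &c2 };

        printf("seq 2 -> commit lsn 0x%llx\n",
               (unsigned long long)commit_lsn_for_sequence(&c3, 2));
        printf("seq 3 -> %llu (still committing)\n",
               (unsigned long long)commit_lsn_for_sequence(&c3, 3));
        return 0;
}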
+ * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) + +/* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. + */ +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number. Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. 
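As a quick sanity check of XLOG_CIL_SPACE_LIMIT() above: the background-push threshold is one eighth of the log size, so a 128 MiB log starts background CIL pushes once roughly 16 MiB of checkpoint space has accumulated. A minimal, self-contained sketch of that arithmetic (the struct is reduced to the one field the macro reads):

#include <stdio.h>

struct xlog_model { int l_logsize; };  /* only the field the macro uses */

#define XLOG_CIL_SPACE_LIMIT(log)       ((log)->l_logsize >> 3)

int main(void)
{
        struct xlog_model log = { .l_logsize = 128 * 1024 * 1024 };

        printf("background CIL push threshold: %d MiB\n",
               XLOG_CIL_SPACE_LIMIT(&log) / (1024 * 1024));     /* prints 16 */
        return 0;
}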
+ */ +struct xlog { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */ + struct delayed_work l_work; /* background flush work */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; + + struct xfs_kobj l_kobj; + + /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG + void *l_iclog_bak[XLOG_MAX_ICLOGS]; + /* log record crc error injection factor */ + uint32_t l_badcrc_factor; +#endif + /* log recovery lsn tracking (for buffer submission */ + xfs_lsn_t l_recovery_lsn; +}; + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) + +/* common routines */ +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); +extern void +xlog_recover_cancel(struct xlog *); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +void xlog_print_trans(struct xfs_trans *); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are 
cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); + +/* + * CIL force routines + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct xlog *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} + +/* + * The LSN is valid so long as it is behind the current LSN. If it isn't, this + * means that the next log record that includes this metadata could have a + * smaller LSN. In turn, this means that the modification in the log would not + * replay. + */ +static inline bool +xlog_valid_lsn( + struct xlog *log, + xfs_lsn_t lsn) +{ + int cur_cycle; + int cur_block; + bool valid = true; + + /* + * First, sample the current lsn without locking to avoid added + * contention from metadata I/O. The current cycle and block are updated + * (in xlog_state_switch_iclogs()) and read here in a particular order + * to avoid false negatives (e.g., thinking the metadata LSN is valid + * when it is not). + * + * The current block is always rewound before the cycle is bumped in + * xlog_state_switch_iclogs() to ensure the current LSN is never seen in + * a transiently forward state. Instead, we can see the LSN in a + * transiently behind state if we happen to race with a cycle wrap. 
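The grant-head helpers above pack a cycle number and a byte count into a single 64-bit value so that both halves can be read and updated through one atomic variable. A small userspace round-trip of the same packing arithmetic:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* same packing as xlog_assign_grant_head_val(): cycle in the high 32 bits */
static int64_t grant_head_val(int cycle, int space)
{
        return ((int64_t)cycle << 32) | space;
}

/* same unpacking as xlog_crack_grant_head_val() */
static void grant_head_crack(int64_t val, int *cycle, int *space)
{
        *cycle = val >> 32;
        *space = val & 0xffffffff;
}

int main(void)
{
        int cycle, space;
        int64_t head = grant_head_val(7, 123456);       /* cycle 7, 123456 bytes */

        grant_head_crack(head, &cycle, &space);
        assert(cycle == 7 && space == 123456);
        printf("cycle=%d space=%d\n", cycle, space);
        return 0;
}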
+ */ + cur_cycle = READ_ONCE(log->l_curr_cycle); + smp_rmb(); + cur_block = READ_ONCE(log->l_curr_block); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { + /* + * If the metadata LSN appears invalid, it's possible the check + * above raced with a wrap to the next log cycle. Grab the lock + * to check for sure. + */ + spin_lock(&log->l_icloglock); + cur_cycle = log->l_curr_cycle; + cur_block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) + valid = false; + } + + return valid; +} + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/fs/xfs/xfs_trans_priv.h b/ops/os_stat/os_stat/include_tk4_arm/fs/xfs/xfs_trans_priv.h new file mode 100644 index 0000000000000000000000000000000000000000..2e073c1c4614f2a79cc9452854da1cead65fb06c --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/fs/xfs/xfs_trans_priv.h @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_TRANS_PRIV_H__ +#define __XFS_TRANS_PRIV_H__ + +struct xfs_log_item; +struct xfs_mount; +struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; + + +void xfs_trans_init(struct xfs_mount *); +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); + +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, bool aborted); +/* + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. + */ +struct xfs_ail_cursor { + struct list_head list; + struct xfs_log_item *item; +}; + +/* + * Private AIL structures. + * + * Eventually we need to drive the locking in here as well. + */ +struct xfs_ail { + struct xfs_mount *ail_mount; + struct task_struct *ail_task; + struct list_head ail_head; + xfs_lsn_t ail_target; + xfs_lsn_t ail_target_prev; + struct list_head ail_cursors; + spinlock_t ail_lock; + xfs_lsn_t ail_last_pushed_lsn; + int ail_log_flush; + struct list_head ail_buf_list; + wait_queue_head_t ail_empty; +}; + +/* + * From xfs_trans_ail.c + */ +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->ail_lock); +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. 
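The AIL cursor scheme described above hinges on a pointer trick: when the item a cursor references is removed, the cursor is invalidated by setting the low bit of its item pointer, which lets a later traversal tell "restart from the head" apart from "reached the end of the list" (a NULL pointer). A minimal userspace rendering of that tagging; the types and helpers are invented for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct item {
        int id;
};

struct cursor {
        struct item *item;      /* next item to visit, NULL, or a tagged pointer */
};

/* mark the cursor invalid by setting the low bit of its item pointer */
static void cursor_invalidate(struct cursor *cur)
{
        cur->item = (struct item *)((uintptr_t)cur->item | 1);
}

/* true if the cursor was invalidated (low bit set), as opposed to done (NULL) */
static int cursor_invalidated(const struct cursor *cur)
{
        return ((uintptr_t)cur->item & 1) != 0;
}

int main(void)
{
        struct item a = { .id = 1 };
        struct cursor cur = { .item = &a };

        /* item 'a' is removed from the list while the cursor still points at it */
        cursor_invalidate(&cur);
        assert(cursor_invalidated(&cur));       /* traversal must restart from the head */

        cur.item = NULL;                        /* NULL instead means "end of list" */
        assert(!cursor_invalidated(&cur));
        printf("invalidation and end-of-list are distinguishable\n");
        return 0;
}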
+ */ +static inline struct xfs_log_item * +xfs_ail_min( + struct xfs_ail *ailp) +{ + return list_first_entry_or_null(&ailp->ail_head, struct xfs_log_item, + li_ail); +} + +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->ail_lock) +{ + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->ail_lock); + +static inline void +xfs_trans_ail_remove( + struct xfs_log_item *lip, + int shutdown_type) +{ + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->ail_lock); + /* xfs_trans_ail_delete() drops the AIL lock */ + if (test_bit(XFS_LI_IN_AIL, &lip->li_flags)) + xfs_trans_ail_delete(ailp, lip, shutdown_type); + else + spin_unlock(&ailp->ail_lock); +} + +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +void xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); + +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); + +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->ail_lock); + *dst = *src; + spin_unlock(&ailp->ail_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); + lockdep_assert_held(&lip->li_ailp->ail_lock); + + if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->ail_lock); + + if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { + xfs_buf_hold(bp); + lip->li_buf = bp; + } +} + +#endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/include/linux/percpu-defs.h b/ops/os_stat/os_stat/include_tk4_arm/include/linux/percpu-defs.h new file mode 100644 index 0000000000000000000000000000000000000000..176bfbd52d97557ea754709f3b0bd2bb370a7852 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/include/linux/percpu-defs.h @@ -0,0 +1,527 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * linux/percpu-defs.h - basic definitions for percpu areas + * + * DO NOT INCLUDE DIRECTLY OUTSIDE PERCPU IMPLEMENTATION PROPER. + * + * This file is separate from linux/percpu.h to avoid cyclic inclusion + * dependency from arch header files. Only to be included from + * asm/percpu.h. + * + * This file includes macros necessary to declare percpu sections and + * variables, and definitions of percpu accessors and operations. 
It + * should provide enough percpu features to arch header files even when + * they can only include asm/percpu.h to avoid cyclic inclusion dependency. + */ + +#ifndef _LINUX_PERCPU_DEFS_H +#define _LINUX_PERCPU_DEFS_H + +#ifdef CONFIG_SMP + +#ifdef MODULE +#define PER_CPU_SHARED_ALIGNED_SECTION "" +#define PER_CPU_ALIGNED_SECTION "" +#else +#define PER_CPU_SHARED_ALIGNED_SECTION "..shared_aligned" +#define PER_CPU_ALIGNED_SECTION "..shared_aligned" +#endif +#define PER_CPU_FIRST_SECTION "..first" + +#else + +#define PER_CPU_SHARED_ALIGNED_SECTION "" +#define PER_CPU_ALIGNED_SECTION "..shared_aligned" +#define PER_CPU_FIRST_SECTION "" + +#endif + +/* + * Base implementations of per-CPU variable declarations and definitions, where + * the section in which the variable is to be placed is provided by the + * 'sec' argument. This may be used to affect the parameters governing the + * variable's storage. + * + * NOTE! The sections for the DECLARE and for the DEFINE must match, lest + * linkage errors occur due the compiler generating the wrong code to access + * that section. + */ +#define __PCPU_ATTRS(sec) \ + __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ + PER_CPU_ATTRIBUTES + +#define __PCPU_DUMMY_ATTRS \ + __attribute__((section(".discard"), unused)) + +/* + * s390 and alpha modules require percpu variables to be defined as + * weak to force the compiler to generate GOT based external + * references for them. This is necessary because percpu sections + * will be located outside of the usually addressable area. + * + * This definition puts the following two extra restrictions when + * defining percpu variables. + * + * 1. The symbol must be globally unique, even the static ones. + * 2. Static percpu variables cannot be defined inside a function. + * + * Archs which need weak percpu definitions should define + * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary. + * + * To ensure that the generic code observes the above two + * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak + * definition is used for all cases. + */ +#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU) +/* + * __pcpu_scope_* dummy variable is used to enforce scope. It + * receives the static modifier when it's used in front of + * DEFINE_PER_CPU() and will trigger build failure if + * DECLARE_PER_CPU() is used for the same variable. + * + * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness + * such that hidden weak symbol collision, which will cause unrelated + * variables to share the same address, can be detected during build. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + extern __PCPU_ATTRS(sec) __typeof__(type) name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + extern __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ + __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ + extern __PCPU_ATTRS(sec) __typeof__(type) name; \ + __PCPU_ATTRS(sec) __weak __typeof__(type) name +#else +/* + * Normal declaration and definition macros. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_ATTRS(sec) __typeof__(type) name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_ATTRS(sec) __typeof__(type) name +#endif + +/* + * Variant on the per-CPU variable declaration/definition theme used for + * ordinary per-CPU variables. 
+ */ +#define DECLARE_PER_CPU(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "") + +#define DEFINE_PER_CPU(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "") + +/* + * Declaration/definition used for per-CPU variables that must come first in + * the set of variables. + */ +#define DECLARE_PER_CPU_FIRST(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) + +#define DEFINE_PER_CPU_FIRST(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) + +/* + * Declaration/definition used for per-CPU variables that must be cacheline + * aligned under SMP conditions so that, whilst a particular instance of the + * data corresponds to a particular CPU, inefficiencies due to direct access by + * other CPUs are reduced by preventing the data from unnecessarily spanning + * cachelines. + * + * An example of this would be statistical data, where each CPU's set of data + * is updated by that CPU alone, but the data from across all CPUs is collated + * by a CPU processing a read from a proc file. + */ +#define DECLARE_PER_CPU_SHARED_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +#define DECLARE_PER_CPU_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ + ____cacheline_aligned + +#define DEFINE_PER_CPU_ALIGNED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ + ____cacheline_aligned + +/* + * Declaration/definition used for per-CPU variables that must be page aligned. + */ +#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..page_aligned") \ + __aligned(PAGE_SIZE) + +#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ + __aligned(PAGE_SIZE) + +/* + * Declaration/definition used for per-CPU variables that must be read mostly. + */ +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..read_mostly") + +#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..read_mostly") + +/* + * Declaration/definition used for per-CPU variables that should be accessed + * as decrypted when memory encryption is enabled in the guest. + */ +#ifdef CONFIG_AMD_MEM_ENCRYPT +#define DECLARE_PER_CPU_DECRYPTED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..decrypted") + +#define DEFINE_PER_CPU_DECRYPTED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..decrypted") +#else +#define DEFINE_PER_CPU_DECRYPTED(type, name) DEFINE_PER_CPU(type, name) +#endif + +/* + * Intermodule exports for per-CPU variables. sparse forgets about + * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to + * noop if __CHECKER__. + */ +#ifndef __CHECKER__ +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var) +#else +#define EXPORT_PER_CPU_SYMBOL(var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) +#endif + +/* + * Accessors and operations. + */ +#ifndef __ASSEMBLY__ + +/* + * __verify_pcpu_ptr() verifies @ptr is a percpu pointer without evaluating + * @ptr and is invoked once before a percpu area is accessed by all + * accessors and operations. 
This is performed in the generic part of + * percpu and arch overrides don't need to worry about it; however, if an + * arch wants to implement an arch-specific percpu accessor or operation, + * it may use __verify_pcpu_ptr() to verify the parameters. + * + * + 0 is required in order to convert the pointer type from a + * potential array type to a pointer to a single item of the array. + */ +#define __verify_pcpu_ptr(ptr) \ +do { \ + const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \ + (void)__vpp_verify; \ +} while (0) + +#ifdef CONFIG_SMP + +/* + * Add an offset to a pointer but keep the pointer as-is. Use RELOC_HIDE() + * to prevent the compiler from making incorrect assumptions about the + * pointer value. The weird cast keeps both GCC and sparse happy. + */ +#define SHIFT_PERCPU_PTR(__p, __offset) \ + RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)) + +#define per_cpu_ptr(ptr, cpu) \ +({ \ + __verify_pcpu_ptr(ptr); \ + SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))); \ +}) + +#define raw_cpu_ptr(ptr) \ +({ \ + __verify_pcpu_ptr(ptr); \ + arch_raw_cpu_ptr(ptr); \ +}) + +#ifdef CONFIG_DEBUG_PREEMPT +#define this_cpu_ptr(ptr) \ +({ \ + __verify_pcpu_ptr(ptr); \ + SHIFT_PERCPU_PTR(ptr, my_cpu_offset); \ +}) +#else +#define this_cpu_ptr(ptr) raw_cpu_ptr(ptr) +#endif + +#else /* CONFIG_SMP */ + +#define VERIFY_PERCPU_PTR(__p) \ +({ \ + __verify_pcpu_ptr(__p); \ + (typeof(*(__p)) __kernel __force *)(__p); \ +}) + +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); VERIFY_PERCPU_PTR(ptr); }) +#define raw_cpu_ptr(ptr) per_cpu_ptr(ptr, 0) +#define this_cpu_ptr(ptr) raw_cpu_ptr(ptr) + +#endif /* CONFIG_SMP */ + +#define per_cpu(var, cpu) (*per_cpu_ptr(&(var), cpu)) + +/* + * Must be an lvalue. Since @var must be a simple identifier, + * we force a syntax error here if it isn't. + */ +#define get_cpu_var(var) \ +(*({ \ + preempt_disable(); \ + this_cpu_ptr(&var); \ +})) + +/* + * The weird & is necessary because sparse considers (void)(var) to be + * a direct dereference of percpu variable (var). + */ +#define put_cpu_var(var) \ +do { \ + (void)&(var); \ + preempt_enable(); \ +} while (0) + +#define get_cpu_ptr(var) \ +({ \ + preempt_disable(); \ + this_cpu_ptr(var); \ +}) + +#define put_cpu_ptr(var) \ +do { \ + (void)(var); \ + preempt_enable(); \ +} while (0) + +/* + * Branching function to split up a function into a set of functions that + * are called for different scalar sizes of the objects handled. + */ + +extern void __bad_size_call_parameter(void); + +#ifdef CONFIG_DEBUG_PREEMPT +extern void __this_cpu_preempt_check(const char *op); +#else +static inline void __this_cpu_preempt_check(const char *op) { } +#endif + +#define __pcpu_size_call_return(stem, variable) \ +({ \ + typeof(variable) pscr_ret__; \ + __verify_pcpu_ptr(&(variable)); \ + switch(sizeof(variable)) { \ + case 1: pscr_ret__ = stem##1(variable); break; \ + case 2: pscr_ret__ = stem##2(variable); break; \ + case 4: pscr_ret__ = stem##4(variable); break; \ + case 8: pscr_ret__ = stem##8(variable); break; \ + default: \ + __bad_size_call_parameter(); break; \ + } \ + pscr_ret__; \ +}) + +#define __pcpu_size_call_return2(stem, variable, ...) 
\ +({ \ + typeof(variable) pscr2_ret__; \ + __verify_pcpu_ptr(&(variable)); \ + switch(sizeof(variable)) { \ + case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \ + case 2: pscr2_ret__ = stem##2(variable, __VA_ARGS__); break; \ + case 4: pscr2_ret__ = stem##4(variable, __VA_ARGS__); break; \ + case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \ + default: \ + __bad_size_call_parameter(); break; \ + } \ + pscr2_ret__; \ +}) + +/* + * Special handling for cmpxchg_double. cmpxchg_double is passed two + * percpu variables. The first has to be aligned to a double word + * boundary and the second has to follow directly thereafter. + * We enforce this on all architectures even if they don't support + * a double cmpxchg instruction, since it's a cheap requirement, and it + * avoids breaking the requirement for architectures with the instruction. + */ +#define __pcpu_double_call_return_bool(stem, pcp1, pcp2, ...) \ +({ \ + bool pdcrb_ret__; \ + __verify_pcpu_ptr(&(pcp1)); \ + BUILD_BUG_ON(sizeof(pcp1) != sizeof(pcp2)); \ + VM_BUG_ON((unsigned long)(&(pcp1)) % (2 * sizeof(pcp1))); \ + VM_BUG_ON((unsigned long)(&(pcp2)) != \ + (unsigned long)(&(pcp1)) + sizeof(pcp1)); \ + switch(sizeof(pcp1)) { \ + case 1: pdcrb_ret__ = stem##1(pcp1, pcp2, __VA_ARGS__); break; \ + case 2: pdcrb_ret__ = stem##2(pcp1, pcp2, __VA_ARGS__); break; \ + case 4: pdcrb_ret__ = stem##4(pcp1, pcp2, __VA_ARGS__); break; \ + case 8: pdcrb_ret__ = stem##8(pcp1, pcp2, __VA_ARGS__); break; \ + default: \ + __bad_size_call_parameter(); break; \ + } \ + pdcrb_ret__; \ +}) + +#define __pcpu_size_call(stem, variable, ...) \ +do { \ + __verify_pcpu_ptr(&(variable)); \ + switch(sizeof(variable)) { \ + case 1: stem##1(variable, __VA_ARGS__);break; \ + case 2: stem##2(variable, __VA_ARGS__);break; \ + case 4: stem##4(variable, __VA_ARGS__);break; \ + case 8: stem##8(variable, __VA_ARGS__);break; \ + default: \ + __bad_size_call_parameter();break; \ + } \ +} while (0) + +/* + * this_cpu operations (C) 2008-2013 Christoph Lameter + * + * Optimized manipulation for memory allocated through the per cpu + * allocator or for addresses of per cpu variables. + * + * These operation guarantee exclusivity of access for other operations + * on the *same* processor. The assumption is that per cpu data is only + * accessed by a single processor instance (the current one). + * + * The arch code can provide optimized implementation by defining macros + * for certain scalar sizes. F.e. provide this_cpu_add_2() to provide per + * cpu atomic operations for 2 byte sized RMW actions. If arch code does + * not provide operations for a scalar size then the fallback in the + * generic code will be used. + * + * cmpxchg_double replaces two adjacent scalars at once. The first two + * parameters are per cpu variables which have to be of the same size. A + * truth value is returned to indicate success or failure (since a double + * register result is difficult to handle). There is very limited hardware + * support for these operations, so only certain sizes may work. + */ + +/* + * Operations for contexts where we do not want to do any checks for + * preemptions. Unless strictly necessary, always use [__]this_cpu_*() + * instead. 
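Stepping back to the SMP accessors defined above (SHIFT_PERCPU_PTR(), per_cpu_ptr(), this_cpu_ptr()): they all reduce to adding a per-CPU offset to the address of the template variable. The sketch below models that with a flat array of per-CPU areas in place of real linker sections and per_cpu_offset(); every name in it is invented, and it only illustrates the offset addition (it uses GCC's typeof, like the macros above).

#include <stdio.h>

#define NR_CPUS         4
#define AREA_SIZE       64      /* size of each CPU's copy of the percpu area */

/* one private copy of the "percpu area" per CPU, back to back */
static _Alignas(long) char pcpu_base[NR_CPUS][AREA_SIZE];

/* byte distance from the template area (CPU 0 here) to cpu's copy */
static long per_cpu_offset_model(int cpu)
{
        return (char *)pcpu_base[cpu] - (char *)pcpu_base[0];
}

/* per_cpu_ptr() model: shift a pointer into the template area by the offset */
#define per_cpu_ptr_model(ptr, cpu) \
        ((typeof(ptr))((char *)(ptr) + per_cpu_offset_model(cpu)))

/* stands in for "DEFINE_PER_CPU(int, counter)": the template lives in CPU 0's area */
static int *counter_tmpl = (int *)pcpu_base[0];

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                *per_cpu_ptr_model(counter_tmpl, cpu) = cpu * 10;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d counter = %d\n", cpu,
                       *per_cpu_ptr_model(counter_tmpl, cpu));
        return 0;
}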
+ * + * If there is no other protection through preempt disable and/or disabling + * interupts then one of these RMW operations can show unexpected behavior + * because the execution thread was rescheduled on another processor or an + * interrupt occurred and the same percpu variable was modified from the + * interrupt context. + */ +#define raw_cpu_read(pcp) __pcpu_size_call_return(raw_cpu_read_, pcp) +#define raw_cpu_write(pcp, val) __pcpu_size_call(raw_cpu_write_, pcp, val) +#define raw_cpu_add(pcp, val) __pcpu_size_call(raw_cpu_add_, pcp, val) +#define raw_cpu_and(pcp, val) __pcpu_size_call(raw_cpu_and_, pcp, val) +#define raw_cpu_or(pcp, val) __pcpu_size_call(raw_cpu_or_, pcp, val) +#define raw_cpu_add_return(pcp, val) __pcpu_size_call_return2(raw_cpu_add_return_, pcp, val) +#define raw_cpu_xchg(pcp, nval) __pcpu_size_call_return2(raw_cpu_xchg_, pcp, nval) +#define raw_cpu_cmpxchg(pcp, oval, nval) \ + __pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval) +#define raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2) + +#define raw_cpu_sub(pcp, val) raw_cpu_add(pcp, -(val)) +#define raw_cpu_inc(pcp) raw_cpu_add(pcp, 1) +#define raw_cpu_dec(pcp) raw_cpu_sub(pcp, 1) +#define raw_cpu_sub_return(pcp, val) raw_cpu_add_return(pcp, -(typeof(pcp))(val)) +#define raw_cpu_inc_return(pcp) raw_cpu_add_return(pcp, 1) +#define raw_cpu_dec_return(pcp) raw_cpu_add_return(pcp, -1) + +/* + * Operations for contexts that are safe from preemption/interrupts. These + * operations verify that preemption is disabled. + */ +#define __this_cpu_read(pcp) \ +({ \ + __this_cpu_preempt_check("read"); \ + raw_cpu_read(pcp); \ +}) + +#define __this_cpu_write(pcp, val) \ +({ \ + __this_cpu_preempt_check("write"); \ + raw_cpu_write(pcp, val); \ +}) + +#define __this_cpu_add(pcp, val) \ +({ \ + __this_cpu_preempt_check("add"); \ + raw_cpu_add(pcp, val); \ +}) + +#define __this_cpu_and(pcp, val) \ +({ \ + __this_cpu_preempt_check("and"); \ + raw_cpu_and(pcp, val); \ +}) + +#define __this_cpu_or(pcp, val) \ +({ \ + __this_cpu_preempt_check("or"); \ + raw_cpu_or(pcp, val); \ +}) + +#define __this_cpu_add_return(pcp, val) \ +({ \ + __this_cpu_preempt_check("add_return"); \ + raw_cpu_add_return(pcp, val); \ +}) + +#define __this_cpu_xchg(pcp, nval) \ +({ \ + __this_cpu_preempt_check("xchg"); \ + raw_cpu_xchg(pcp, nval); \ +}) + +#define __this_cpu_cmpxchg(pcp, oval, nval) \ +({ \ + __this_cpu_preempt_check("cmpxchg"); \ + raw_cpu_cmpxchg(pcp, oval, nval); \ +}) + +#define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ +({ __this_cpu_preempt_check("cmpxchg_double"); \ + raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2); \ +}) + +#define __this_cpu_sub(pcp, val) __this_cpu_add(pcp, -(typeof(pcp))(val)) +#define __this_cpu_inc(pcp) __this_cpu_add(pcp, 1) +#define __this_cpu_dec(pcp) __this_cpu_sub(pcp, 1) +#define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(typeof(pcp))(val)) +#define __this_cpu_inc_return(pcp) __this_cpu_add_return(pcp, 1) +#define __this_cpu_dec_return(pcp) __this_cpu_add_return(pcp, -1) + +/* + * Operations with implied preemption/interrupt protection. These + * operations can be used without worrying about preemption or interrupt. 
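The __pcpu_size_call*() macros above are a dispatch-by-size pattern: a statement expression switches on sizeof(variable) and pastes the width onto the operation name, so one generic macro fans out to whatever per-width helpers the architecture supplies. A standalone illustration of the same trick, with invented read_1/2/4/8 helpers standing in for the arch operations (like the originals it relies on GCC's typeof and statement expressions):

#include <stdint.h>
#include <stdio.h>

/* per-width "arch" helpers that the generic macro dispatches to */
static uint8_t  read_1(const void *p) { return *(const uint8_t  *)p; }
static uint16_t read_2(const void *p) { return *(const uint16_t *)p; }
static uint32_t read_4(const void *p) { return *(const uint32_t *)p; }
static uint64_t read_8(const void *p) { return *(const uint64_t *)p; }

/*
 * Generic front end: sizeof(variable) picks the helper; the other branches
 * are compiled but never executed, just like __pcpu_size_call_return() above.
 */
#define generic_read(variable)                                          \
({                                                                      \
        typeof(variable) ret__;                                         \
        switch (sizeof(variable)) {                                     \
        case 1: ret__ = (typeof(variable))read_1(&(variable)); break;  \
        case 2: ret__ = (typeof(variable))read_2(&(variable)); break;  \
        case 4: ret__ = (typeof(variable))read_4(&(variable)); break;  \
        case 8: ret__ = (typeof(variable))read_8(&(variable)); break;  \
        default: ret__ = 0; break;                                      \
        }                                                               \
        ret__;                                                          \
})

int main(void)
{
        uint16_t small = 0xbeef;
        uint64_t big = 0x1122334455667788ULL;

        printf("%x %llx\n", (unsigned)generic_read(small),
               (unsigned long long)generic_read(big));
        return 0;
}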
+ */ +#define this_cpu_read(pcp) __pcpu_size_call_return(this_cpu_read_, pcp) +#define this_cpu_write(pcp, val) __pcpu_size_call(this_cpu_write_, pcp, val) +#define this_cpu_add(pcp, val) __pcpu_size_call(this_cpu_add_, pcp, val) +#define this_cpu_and(pcp, val) __pcpu_size_call(this_cpu_and_, pcp, val) +#define this_cpu_or(pcp, val) __pcpu_size_call(this_cpu_or_, pcp, val) +#define this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val) +#define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval) +#define this_cpu_cmpxchg(pcp, oval, nval) \ + __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval) +#define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ + __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2) + +#define this_cpu_sub(pcp, val) this_cpu_add(pcp, -(typeof(pcp))(val)) +#define this_cpu_inc(pcp) this_cpu_add(pcp, 1) +#define this_cpu_dec(pcp) this_cpu_sub(pcp, 1) +#define this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(typeof(pcp))(val)) +#define this_cpu_inc_return(pcp) this_cpu_add_return(pcp, 1) +#define this_cpu_dec_return(pcp) this_cpu_add_return(pcp, -1) + +#endif /* __ASSEMBLY__ */ +#endif /* _LINUX_PERCPU_DEFS_H */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/autogroup.h b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/autogroup.h new file mode 100644 index 0000000000000000000000000000000000000000..b96419974a1f0e88cf3e0a7b203bde1fcff89b2c --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/autogroup.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + /* + * Reference doesn't mean how many threads attach to this + * autogroup now. It just stands for the number of tasks + * which could use this autogroup. 
+ */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/cpudeadline.h b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/cpudeadline.h new file mode 100644 index 0000000000000000000000000000000000000000..1c1181cd473f25e57790aa2407a55ab7aa54cb5c --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/cpudeadline.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define IDX_INVALID -1 + +struct cpudl_item { + u64 dl; + int cpu; + int idx; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + cpumask_var_t free_cpus; + struct cpudl_item *elements; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); +int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); +void cpudl_cleanup(struct cpudl *cp); +#endif /* CONFIG_SMP */ diff --git a/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/cpupri.h b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/cpupri.h new file mode 100644 index 0000000000000000000000000000000000000000..7dc20a3232e726b3b5f91389395f49d7525120a5 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/cpupri.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + atomic_t count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int *cpu_to_pri; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#endif diff --git a/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/features.h b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/features.h new file mode 100644 index 0000000000000000000000000000000000000000..66c74aa4753e79c04d4c52d96a37525b514983ef --- 
/dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/features.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ +SCHED_FEAT(START_DEBIT, true) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, false) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, true) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, true) + +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) + +/* + * Decrement CPU capacity based on time not spent running tasks + */ +SCHED_FEAT(NONTASK_CAPACITY, true) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, true) + +/* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. + */ +SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) + +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. + */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. + */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + +SCHED_FEAT(RT_RUNTIME_SHARE, false) +SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) + +SCHED_FEAT(WA_IDLE, true) +SCHED_FEAT(WA_WEIGHT, true) +SCHED_FEAT(WA_BIAS, true) + +/* + * UtilEstimation. Use estimated CPU utilization. 
+ */ +SCHED_FEAT(UTIL_EST, true) diff --git a/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/sched.h b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/sched.h new file mode 100644 index 0000000000000000000000000000000000000000..59ba2713373f65a3f1e842b37a817a78544499ec --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/sched.h @@ -0,0 +1,3215 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal types and methods: + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_PARAVIRT +# include +#endif + +#include "cpupri.h" +#include "cpudeadline.h" + +#ifdef CONFIG_SCHED_DEBUG +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +#endif + +struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 + +extern __read_mostly int scheduler_running; + +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; +#ifdef CONFIG_BT_SCHED +extern atomic_long_t calc_bt_load_tasks; +extern long calc_bt_load_fold_active(struct rq *this_rq, long adjust); +#endif + +extern void calc_global_load_tick(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +/* + * Latency nice is meant to provide scheduler hints about the relative + * latency requirements of a task with respect to other tasks. + * Thus a task with latency_nice == 19 can be hinted as the task with no + * latency requirements, in contrast to the task with latency_nice == -20 + * which should be given priority in terms of lower latency. + */ +#define MAX_LATENCY_NICE 19 +#define MIN_LATENCY_NICE -20 + +#define LATENCY_NICE_WIDTH \ + (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) + +/* + * Default tasks should be treated as a task with latency_nice = 0. + */ +#define DEFAULT_LATENCY_NICE 0 +#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static latency [ 0..39 ], + * and back. + */ +#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) +#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) +#define NICE_LATENCY_SHIFT (SCHED_FIXEDPOINT_SHIFT) +#define NICE_LATENCY_WEIGHT_MAX (1L << NICE_LATENCY_SHIFT) + +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. 64-bit). 
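Plugging numbers into the latency-nice macros above: the width of the range [-20, 19] is 40, DEFAULT_LATENCY_PRIO works out to 20, and NICE_TO_LATENCY() therefore maps -20..19 onto the static latency range 0..39. A trivial standalone check of that arithmetic:

#include <assert.h>
#include <stdio.h>

#define MAX_LATENCY_NICE        19
#define MIN_LATENCY_NICE        -20
#define LATENCY_NICE_WIDTH      (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1)
#define DEFAULT_LATENCY_NICE    0
#define DEFAULT_LATENCY_PRIO    (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH / 2)

#define NICE_TO_LATENCY(nice)   ((nice) + DEFAULT_LATENCY_PRIO)
#define LATENCY_TO_NICE(prio)   ((prio) - DEFAULT_LATENCY_PRIO)

int main(void)
{
        assert(LATENCY_NICE_WIDTH == 40 && DEFAULT_LATENCY_PRIO == 20);
        assert(NICE_TO_LATENCY(MIN_LATENCY_NICE) == 0);         /* most latency sensitive */
        assert(NICE_TO_LATENCY(MAX_LATENCY_NICE) == 39);        /* least latency sensitive */
        assert(LATENCY_TO_NICE(NICE_TO_LATENCY(-5)) == -5);     /* round trip */
        printf("latency nice 0 -> static latency %d\n", NICE_TO_LATENCY(0));
        return 0;
}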
The costs for increasing resolution when 32-bit + * are pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to + * increase coverage and consistency always enable it on 64-bit platforms. + */ +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ +}) +#else +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + + +#ifdef CONFIG_BT_SCHED +#define NICE_TO_BT_PRIO(nice) (MAX_RT_PRIO + (nice) + 20 + 40) +#define BT_PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20 - 40) +#define BT_TASK_NICE(p) BT_PRIO_TO_NICE((p)->static_prio) +#endif + +/* BT uses the same nice value range as CFS and also encodes + * its static priority in task_struct's static_prio field + */ +#ifdef CONFIG_BT_SCHED +#define BT_USER_PRIO(p) ((p)-MAX_RT_PRIO-40) +#define BT_TASK_USER_PRIO(p) BT_USER_PRIO((p)->static_prio) +#endif + + +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) + +/* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE 10 + +/* + * Single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + +#ifdef CONFIG_BT_SCHED +static inline int bt_policy(int policy) +{ + if (policy == SCHED_BT) + return 1; + return 0; +} + +static inline int task_has_bt_policy(const struct task_struct *p) +{ + return bt_policy(p->policy); +} + +#define RQ_CFS_NR_RUNNING(rq) \ + ((rq)->nr_running - (rq)->bt_nr_running) +#else + +#define RQ_CFS_NR_RUNNING(rq) \ + ((rq)->nr_running) +#endif + +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + +static inline int rt_policy(int policy) +{ + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; +} +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || +#ifdef CONFIG_BT_SCHED + bt_policy(policy) || +#endif + rt_policy(policy) || dl_policy(policy); +} + +static inline int task_has_idle_policy(struct task_struct *p) +{ + return idle_policy(p->policy); +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. 
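To make the 64-bit load-resolution scaling above concrete: assuming the usual SCHED_FIXEDPOINT_SHIFT of 10 (an assumption, since that constant is defined elsewhere), scale_load() multiplies a weight by 1024, scale_load_down() divides it back and never returns less than 2 for a non-zero input, and NICE_0_LOAD becomes 1 << 20. A short standalone check:

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT  10      /* assumed value; defined elsewhere in the kernel */
#define NICE_0_LOAD_SHIFT       (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)

#define scale_load(w)           ((w) << SCHED_FIXEDPOINT_SHIFT)
#define scale_load_down(w)                              \
({                                                      \
        unsigned long __w = (w);                        \
        if (__w) {                                      \
                __w >>= SCHED_FIXEDPOINT_SHIFT;         \
                if (__w < 2)                            \
                        __w = 2;                        \
        }                                               \
        __w;                                            \
})

#define NICE_0_LOAD             (1L << NICE_0_LOAD_SHIFT)

int main(void)
{
        unsigned long nice0_weight = 1024;      /* the nice-0 weight */

        printf("scale_load(1024)   = %lu\n", scale_load(nice0_weight)); /* 1048576 */
        printf("NICE_0_LOAD        = %ld\n", NICE_0_LOAD);              /* 1048576 */
        printf("scale_load_down(1) = %lu\n", scale_load_down(1UL));     /* clamped to 2 */
        return 0;
}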
+ * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; + unsigned int rt_period_active; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +void __dl_clear_params(struct task_struct *p); + +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* + * To keep the bandwidth of -deadline tasks under control + * we need some place where: + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. 
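dl_entity_preempt() above is the core EDF rule: entity a preempts entity b when a is one of the special SUGOV entities or simply has the earlier absolute deadline. A reduced userspace rendering of that comparison; dl_time_before() is modelled here as a wrap-safe signed "earlier than" test, which is an assumption about its definition elsewhere:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dl_entity_model {
        uint64_t deadline;      /* absolute deadline */
        bool     special;       /* stands in for the SCHED_FLAG_SUGOV case */
};

/* wrap-safe "a is earlier than b", modelled after dl_time_before() */
static bool dl_time_before_model(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

/* should entity a preempt entity b? */
static bool dl_entity_preempt_model(const struct dl_entity_model *a,
                                    const struct dl_entity_model *b)
{
        return a->special || dl_time_before_model(a->deadline, b->deadline);
}

int main(void)
{
        struct dl_entity_model a = { .deadline = 1000, .special = false };
        struct dl_entity_model b = { .deadline = 1500, .special = false };

        printf("a preempts b: %d\n", dl_entity_preempt_model(&a, &b));  /* 1: earlier deadline */
        printf("b preempts a: %d\n", dl_entity_preempt_model(&b, &a));  /* 0 */
        return 0;
}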
+ * + * With respect to SMP, bandwidth is given on a per root domain basis, + * meaning that: + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; + */ +struct dl_bw { + raw_spinlock_t lock; + u64 bw; + u64 total_bw; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + +static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + +extern void dl_change_utilization(struct task_struct *p, u64 new_bw); +extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +extern bool dl_cpu_busy(unsigned int cpu); + +#ifdef CONFIG_CGROUP_SCHED + +#include +#include + +struct cfs_rq; +struct rt_rq; +#ifdef CONFIG_BT_SCHED +struct bt_rq; +#endif + +extern struct list_head task_groups; + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota; + u64 runtime; + u64 burst; + u64 buffer; + u64 max_overrun; + u64 runtime_at_period_start; + s64 hierarchical_quota; + + u8 idle; + u8 period_active; + u8 distribute_running; + u8 slack_started; + struct hrtimer period_timer; + struct hrtimer slack_timer; + struct list_head throttled_cfs_rq; + + /* Statistics: */ + int nr_periods; + int nr_throttled; + int nr_burst; + u64 throttled_time; + u64 burst_time; +#endif +#ifdef CONFIG_BT_SHARE_CFS_BANDWIDTH + u64 runtime_bt; + u64 bt_suppress_percent; + struct list_head throttled_bt_rq; + u8 idle_bt; + u8 distribute_running_bt; +#endif +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +/* Task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each CPU */ + struct sched_entity **se; + /* runqueue "owned" by this group on each CPU */ + struct cfs_rq **cfs_rq; + unsigned long shares; + int latency_prio; + +#ifdef CONFIG_SMP + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. 
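The __dl_overflow() test above is worth seeing with numbers: a request is rejected when the bandwidth already admitted in the root domain, minus what the task currently holds and plus what it now asks for, would exceed the per-CPU limit times the number of CPUs. A self-contained sketch of that arithmetic; the fixed-point scale and the 95% cap below are illustrative assumptions, not values taken from this header:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BW_UNIT (1u << 20)      /* illustration only: 1.0 of one CPU */

struct dl_bw_model {
        int64_t bw;             /* per-CPU limit, or -1 for "no limit" */
        int64_t total_bw;       /* bandwidth already admitted in this root domain */
};

/* same shape as __dl_overflow(): true means the new request must be rejected */
static bool dl_overflow(const struct dl_bw_model *dl_b, int cpus,
                        int64_t old_bw, int64_t new_bw)
{
        return dl_b->bw != -1 &&
               dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
        /* 4 CPUs, each capped at 95% for -deadline tasks, 3.5 CPUs already admitted */
        struct dl_bw_model dl_b = {
                .bw = (int64_t)(0.95 * BW_UNIT),
                .total_bw = (int64_t)(3.5 * BW_UNIT),
        };
        int64_t small = BW_UNIT / 10;   /* asks for 10% of one CPU */
        int64_t large = BW_UNIT / 2;    /* asks for 50% of one CPU */

        printf("10%% task: %s\n", dl_overflow(&dl_b, 4, 0, small) ? "reject" : "admit");
        printf("50%% task: %s\n", dl_overflow(&dl_b, 4, 0, large) ? "reject" : "admit");
        return 0;
}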
+ */ + atomic_long_t load_avg ____cacheline_aligned; +#ifdef CONFIG_HT_ISOLATE + int ht_sensi_type; +#endif +#endif +#endif + +#ifdef CONFIG_BT_GROUP_SCHED + struct sched_bt_entity **bt; + struct bt_rq **bt_rq; + unsigned long bt_shares; + + atomic64_t bt_load_avg; + int offline; + struct mutex offline_mutex; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + u64 cpuquota_aware; + struct cfs_bandwidth cfs_bandwidth; + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* The two decimal precision [%] value requested from user-space */ + unsigned int uclamp_pct[UCLAMP_CNT]; + /* Clamp values requested for a task group */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a task group */ + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif + +#ifdef CONFIG_BT_GROUP_SCHED +#define ROOT_TASK_GROUP_BT_LOAD NICE_0_LOAD +#define MIN_BT_SHARES (1UL << 1) +#define MAX_BT_SHARES (1UL << 18) +#define CGROUP_BT_PRIORITY 7 +#endif + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. 
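+ *
+ * Callers interested in only one direction can, for instance, pass the
+ * tg_nop() stub declared below as the other callback, e.g.
+ * walk_tg_tree(tg_nop, my_up_visitor, data).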
+ */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); +extern void unregister_fair_sched_group(struct task_group *tg); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); + +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, int init); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_BT_SCHED +extern void free_bt_sched_group(struct task_group *tg); +extern int alloc_bt_sched_group(struct task_group *tg, struct task_group *parent); +extern int sched_group_set_bt_shares(struct task_group *tg, unsigned long shares); +extern void unregister_bt_sched_group(struct task_group *tg); +extern void init_tg_bt_entry(struct task_group *tg, struct bt_rq *bt_rq, + struct sched_bt_entity *se, int cpu, + struct sched_bt_entity *parent); +extern int sched_bt_can_attach(struct task_group *tg, struct task_struct *tsk); +#endif + +#ifdef CONFIG_BT_GROUP_SCHED +#ifdef CONFIG_BT_BANDWIDTH +extern void online_bt_sched_group(struct task_group *tg); +#endif +#endif + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group *tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); + +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_latency(struct task_group *tg, long latency); + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_BT_GROUP_SCHED +extern int sched_group_set_bt_shares(struct task_group *tg, unsigned long shares); +#endif + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long runnable_weight; + unsigned int nr_running; + unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ + + u64 exec_clock; + u64 min_vruntime; +#ifdef CONFIG_SCHED_CORE + 
unsigned int forceidle_seq; + u64 min_vruntime_fi; +#endif + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root_cached tasks_timeline; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr; + struct sched_entity *next; + struct sched_entity *last; + struct sched_entity *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP + /* + * CFS load tracking + */ + struct sched_avg avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + struct { + raw_spinlock_t lock ____cacheline_aligned; + int nr; + unsigned long load_avg; + unsigned long util_avg; + unsigned long runnable_sum; + } removed; + +#ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long tg_load_avg_contrib; + long propagate; + long prop_runnable_sum; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. + * This list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; + + u64 throttled_clock; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; + int throttled; + int throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + KABI_RESERVE(1); + KABI_RESERVE(2); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_BT_BANDWIDTH +struct bt_bandwidth { + raw_spinlock_t bt_runtime_lock; + ktime_t bt_period; + u64 bt_runtime; + struct hrtimer bt_period_timer; + int timer_active; + unsigned int bt_period_active; +}; +#endif + +#ifdef CONFIG_BT_SCHED +struct bt_rq { + struct load_weight load; + unsigned int nr_running, h_nr_running; + unsigned long nr_uninterruptible; + + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + struct sched_bt_entity *curr; +#ifdef CONFIG_BT_BANDWIDTH + int bt_throttled; + u64 bt_time; + u64 bt_runtime; + raw_spinlock_t bt_runtime_lock; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; +#endif +#ifdef CONFIG_BT_SHARE_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; + u64 throttled_clock_bt; + int throttled; + struct list_head throttled_list; +#endif +#ifdef CONFIG_SMP + /* + * BT Load tracking + */ + struct sched_avg_bt avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; + +#ifdef CONFIG_BT_GROUP_SCHED + unsigned long tg_load_avg_contrib; +#endif /* CONFIG_BT_GROUP_SCHED */ + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif + unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_BT_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this bt_rq is attached */ + + int 
on_list; + struct list_head leaf_bt_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ +#endif /* CONFIG_BT_GROUP_SCHED */ +}; +#endif + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* RT IPI pull logic requires IRQ_WORK */ +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) +# define HAVE_RT_PUSH_IPI +#endif + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; + unsigned int rr_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; + +#endif /* CONFIG_SMP */ + int rt_queued; + + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct task_group *tg; +#endif +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root_cached root; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision whether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root_cached pushable_dl_tasks_root; +#else + struct dl_bw dl_bw; +#endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; + + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + u64 extra_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) +#else +#define entity_is_task(se) 1 +#endif + +#ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. 
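+ *
+ * (As a reminder of the assumed setup: on 64-bit kernels the load weights
+ * are scaled up by SCHED_FIXEDPOINT_SHIFT for extra fixed-point resolution;
+ * se_weight() and se_runnable() below simply shift that back out through
+ * scale_load_down().)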
+ */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + +struct perf_domain { + struct em_perf_domain *em_pd; + struct perf_domain *next; + struct rcu_head rcu; +}; + +/* Scheduling group status flags */ +#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member CPUs from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ + int overload; +#ifdef CONFIG_BT_SCHED + int overload_bt; +#endif + + /* Indicate one or more cpus over-utilized (tipping point) */ + int overutilized; + + /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; + + unsigned long max_cpu_capacity; + + /* + * NULL-terminated list of performance domains intersecting with the + * CPUs of the rd. Protected by RCU. + */ + struct perf_domain __rcu *pd; + + KABI_RESERVE(1); + KABI_RESERVE(2); + KABI_RESERVE(3); + KABI_RESERVE(4); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +extern void init_defrootdomain(void); +extern int sched_init_domains(const struct cpumask *cpu_map); +extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); + +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_UCLAMP_TASK +/* + * struct uclamp_bucket - Utilization clamp bucket + * @value: utilization clamp value for tasks on this clamp bucket + * @tasks: number of RUNNABLE tasks on this clamp bucket + * + * Keep track of how many tasks are RUNNABLE for a given utilization + * clamp value. + */ +struct uclamp_bucket { + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); +}; + +/* + * struct uclamp_rq - rq's utilization clamp + * @value: currently active clamp values for a rq + * @bucket: utilization clamp buckets affecting a rq + * + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. 
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE + * (or actually running) with that value. + * + * There are up to UCLAMP_CNT possible different clamp values, currently there + * are only two: minimum utilization and maximum utilization. + * + * All utilization clamping values are MAX aggregated, since: + * - for util_min: we want to run the CPU at least at the max of the minimum + * utilization required by its currently RUNNABLE tasks. + * - for util_max: we want to allow the CPU to run up to the max of the + * maximum utilization allowed by its currently RUNNABLE tasks. + * + * Since on each system we expect only a limited number of different + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track + * the metrics required to compute all the per-rq utilization clamp values. + */ +struct uclamp_rq { + unsigned int value; + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; +}; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); +#endif /* CONFIG_UCLAMP_TASK */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t __lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned int nr_running; +#ifdef CONFIG_BT_SCHED + unsigned int bt_nr_running; + u64 bt_blocked_clock; +#endif +#ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; +#endif +#ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +#ifdef CONFIG_HT_ISOLATE + int core_curr_stat; + int ht_sensi_type; +#endif +#endif /* CONFIG_SMP */ + unsigned int nohz_tick_stopped; + atomic_t nohz_flags; +#endif /* CONFIG_NO_HZ_COMMON */ + + unsigned long nr_load_updates; + u64 nr_switches; + +#ifdef CONFIG_BT_SCHED + struct load_weight bt_load; + unsigned long nr_bt_load_updates; +#endif + +#ifdef CONFIG_UCLAMP_TASK + /* Utilization clamp values based on CPU's RUNNABLE tasks */ + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; + unsigned int uclamp_flags; +#define UCLAMP_FLAG_IDLE 0x01 +#endif + + struct cfs_rq cfs; +#ifdef CONFIG_BT_SCHED + struct bt_rq bt; +#endif + struct rt_rq rt; + struct dl_rq dl; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_BT_GROUP_SCHED + struct list_head leaf_bt_rq_list; +#endif /* CONFIG_BT_GROUP_SCHED */ + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. 
Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; +#ifdef CONFIG_BT_SCHED + unsigned long last_balance_bt; +#endif + struct mm_struct *prev_mm; + + unsigned int clock_update_flags; + u64 clock; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; + u64 clock_pelt; + unsigned long lost_idle_time; + + atomic_t nr_iowait; + +#ifdef CONFIG_MEMBARRIER + int membarrier_state; +#endif + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain __rcu *sd; + + unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; + + struct callback_head *balance_callback; + + unsigned char idle_balance; + + unsigned long misfit_task_load; + + /* For active balancing */ + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + + /* CPU of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; +#ifdef CONFIG_BT_SCHED + struct list_head bt_tasks; +#endif + + struct sched_avg avg_rt; + struct sched_avg avg_dl; +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + struct sched_avg avg_irq; +#endif + u64 idle_bt_stamp; + u64 avg_idle_bt; +#ifdef CONFIG_BT_SCHED + u64 idle_stamp; + u64 avg_idle; +#endif + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; +#ifdef CONFIG_BT_SCHED + long calc_bt_load_active; +#endif + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + call_single_data_t hrtick_csd; +#endif + struct hrtimer hrtick_timer; + ktime_t hrtick_time; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + +#ifdef CONFIG_MEM_QOS + struct list_head exp_reclaim_list; +#endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif + +#ifdef CONFIG_SCHED_CORE + /* per rq */ + struct rq *core; + struct task_struct *core_pick; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; + + /* shared state */ + unsigned int core_task_seq; + unsigned int core_pick_seq; + unsigned long core_cookie; + unsigned char core_forceidle; + unsigned int core_forceidle_seq; +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} +#endif + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +struct sched_group; +#ifdef CONFIG_SCHED_CORE +static inline struct cpumask *sched_group_span(struct sched_group *sg); + +DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); + +static inline bool sched_core_enabled(struct rq *rq) +{ + return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled; +} + +static inline bool sched_core_disabled(void) +{ + return !static_branch_unlikely(&__sched_core_enabled); +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + if (sched_core_enabled(rq)) + return &rq->core->__lock; + + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + if (rq->core_enabled) + return &rq->core->__lock; + + return &rq->__lock; +} + +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); + +/* + * Helpers to check if the CPU's core cookie matches with the task's cookie + * when core scheduling is enabled. + * A special case is that the task's cookie always matches with CPU's core + * cookie if the CPU is in an idle core. + */ +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + return rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + bool idle_core = true; + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { + if (!available_idle_cpu(cpu)) { + idle_core = false; + break; + } + } + + /* + * A CPU in an idle core is always the best choice for tasks with + * cookies. + */ + return idle_core || rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. 
*/ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { + if (sched_core_cookie_match(rq, p)) + return true; + } + return false; +} + +extern void queue_core_balance(struct rq *rq); + +static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +extern unsigned long sched_core_alloc_cookie(void); +extern void sched_core_put_cookie(unsigned long cookie); +extern unsigned long sched_core_get_cookie(unsigned long cookie); +extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie); +#else /* !CONFIG_SCHED_CORE */ + +static inline bool sched_core_enabled(struct rq *rq) +{ + return false; +} + +static inline bool sched_core_disabled(void) +{ + return true; +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + return true; +} + +static inline void queue_core_balance(struct rq *rq) +{ +} +#endif /* CONFIG_SCHED_CORE */ + +static inline void lockdep_assert_rq_held(struct rq *rq) +{ + lockdep_assert_held(__rq_lockp(rq)); +} + +extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); +extern bool raw_spin_rq_trylock(struct rq *rq); +extern void raw_spin_rq_unlock(struct rq *rq); + +static inline void raw_spin_rq_lock(struct rq *rq) +{ + raw_spin_rq_lock_nested(rq, 0); +} + +static inline void raw_spin_rq_lock_irq(struct rq *rq) +{ + local_irq_disable(); + raw_spin_rq_lock(rq); +} + +static inline void raw_spin_rq_unlock_irq(struct rq *rq) +{ + raw_spin_rq_unlock(rq); + local_irq_enable(); +} + +static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) +{ + unsigned long flags; + local_irq_save(flags); + raw_spin_rq_lock(rq); + return flags; +} + +static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags) +{ + raw_spin_rq_unlock(rq); + local_irq_restore(flags); +} + +#define raw_spin_rq_lock_irqsave(rq, flags) \ +do { \ + flags = _raw_spin_rq_lock_irqsave(rq); \ +} while (0) + +#ifdef CONFIG_SCHED_SMT +extern void __update_idle_core(struct rq *rq); + +static inline void update_idle_core(struct rq *rq) +{ + if (static_branch_unlikely(&sched_smt_present)) + __update_idle_core(rq); +} + +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + +//DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +// #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +// #define this_rq() this_cpu_ptr(&runqueues) +// #define task_rq(p) cpu_rq(task_cpu(p)) +// #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +// //#define raw_rq() raw_cpu_ptr(&runqueues) + +#define cpu_rq(cpu) (per_cpu_ptr(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +//#define raw_rq() raw_cpu_ptr(&runqueues) +#define raw_rq() raw_cpu_ptr(runqueues) + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline struct 
task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} +#endif + +extern void update_rq_clock(struct rq *rq); + +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return READ_ONCE(rq->clock); +} + +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. + * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq-clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + +static inline u64 rq_clock(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_task; +} + +static inline void rq_clock_skip_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttling, which is the only time a skip + * request is cancelled. + */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). 
+ */ + unsigned int clock_update_flags; +#endif +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(__rq_lockp(rq), rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context. + */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif +} + +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock_irqsave(rq, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock_irq(rq); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock(rq); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock_irqrestore(rq, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock_irq(rq); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +extern void sched_init_numa(void); +extern void sched_domains_numa_masks_set(unsigned int cpu); +extern void sched_domains_numa_masks_clear(unsigned int cpu); +extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); +#else +static inline void sched_init_numa(void) { } +static inline void sched_domains_numa_masks_set(unsigned int cpu) { } +static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } +static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) +{ + return nr_cpu_ids; +} 
+#endif + +#ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); +extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +#else +static inline void +init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_SMP + +static inline void +queue_balance_callback(struct rq *rq, + struct callback_head *head, + void (*func)(struct rq *rq)) +{ + lockdep_assert_rq_held(rq); + + if (unlikely(head->next)) + return; + + head->func = (void (*)(struct callback_head *))func; + head->next = rq->balance_callback; + rq->balance_callback = head; +} + +extern void sched_ttwu_pending(void); + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See destroy_sched_domains: call_rcu for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) + +#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The CPU whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given CPU. + * + * Returns the highest sched_domain of a CPU which contains the given flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + for_each_domain(cpu, sd) { + if (!(sd->flags & flag)) + break; + hsd = sd; + } + + return hsd; +} + +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + if (sd->flags & flag) + break; + } + + return sd; +} + +//DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_size); +DECLARE_PER_CPU(int, sd_llc_id); +//DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); +extern struct static_key_false sched_asym_cpucapacity; + +struct sched_group_capacity { + atomic_t ref; + /* + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity + * for a single CPU. 
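+ *
+ * As a rough illustration: a group spanning four full-capacity CPUs ends up
+ * with capacity close to 4 * SCHED_CAPACITY_SCALE, while min_capacity and
+ * max_capacity below record the per-CPU extremes within the group.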
+ */ + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ + unsigned long next_update; + int imbalance; /* XXX unrelated to capacity but shared group state */ + +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* Balance mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* CPU of highest priority in group */ + + KABI_RESERVE(1); + KABI_RESERVE(2); + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +static inline struct cpumask *sched_group_span(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * See build_balance_mask(). + */ +static inline struct cpumask *group_balance_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgc->cpumask); +} + +/** + * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. + * @group: The group whose first CPU is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_span(group)); +} + +extern int group_balance_cpu(struct sched_group *sg); + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void dirty_sched_domain_sysctl(int cpu); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void dirty_sched_domain_sysctl(int cpu) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + +#else + +static inline void sched_ttwu_pending(void) { } + +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "autogroup.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. 
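+ *
+ * In practice this means the value returned by task_group() below is only
+ * stable while the caller holds at least one of those two locks.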
+ */ +static inline struct task_group *task_group(struct task_struct *p) +{ + return p->sched_task_group; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) || \ + defined(CONFIG_BT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; +#endif + +#ifdef CONFIG_BT_GROUP_SCHED + p->bt.bt_rq = tg->bt_rq[cpu]; + p->bt.parent = tg->bt[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfully executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + WRITE_ONCE(p->cpu, cpu); +#else + WRITE_ONCE(task_thread_info(p)->cpu, cpu); +#endif + p->wake_cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG + +/* + * To support run-time toggling of sched features, all the translation units + * (but core.c) reference the sysctl_sched_features defined in core.c. + */ +extern const_debug unsigned int sysctl_sched_features; + +#ifdef CONFIG_JUMP_LABEL +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ +{ \ + return static_key_##enabled(key); \ +} + +#include "features.h" +#undef SCHED_FEAT + +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) + +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ + +/* + * Each translation unit has its own copy of sysctl_sched_features to allow + * constants propagation at compile time and compiler optimization based on + * features default. 
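+ *
+ * For example, an entry such as SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) in
+ * features.h expands to (1UL << __SCHED_FEAT_GENTLE_FAIR_SLEEPERS) * true,
+ * so including features.h below folds into a single compile-time constant.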
+ */ +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | +static const_debug __maybe_unused unsigned int sysctl_sched_features = +#include "features.h" + 0; +#undef SCHED_FEAT + +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* SCHED_DEBUG */ + +extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +#ifdef CONFIG_BT_BANDWIDTH +static inline u64 global_bt_period(void) +{ + return (u64)sysctl_sched_bt_period * NSEC_PER_USEC; +} +#endif + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; +} + +/* + * wake flags + */ +#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* Child wakeup after fork */ +#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ +/* WF_PIPE used for perfromance, if changed, also change WF_PIPE in pipe.c */ +#define WF_PIPE 0x8 /* see above comment */ + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; +extern const int sched_latency_to_weight[40]; + +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 + +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x40 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif + +#define RETRY_TASK ((void *)-1UL) + +struct sched_class { + const struct sched_class *next; + +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + + /* + * Both @prev and @rf are optional and may be NULL, in which case the + * caller must already have invoked put_prev_task(rq, prev, rf). + * + * Otherwise it is the responsibility of the pick_next_task() to call + * put_prev_task() on the @prev task or something equivalent, IFF it + * returns a next task. + * + * In that case (@rf != NULL) it may return RETRY_TASK when it finds a + * higher prio class has runnable tasks. + */ + struct task_struct * (*pick_next_task)(struct rq *rq, + struct task_struct *prev, + struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + +#ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + struct task_struct * (*pick_task)(struct rq *rq); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + void (*task_fork)(struct task_struct *p); + void (*task_dead)(struct task_struct *p); + + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. 
+ */ + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval)(struct rq *rq, + struct task_struct *task); + + void (*update_curr)(struct rq *rq); + +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_BT_GROUP_SCHED) + void (*task_change_group)(struct task_struct *p, int type); +#endif + + KABI_RESERVE(1); + KABI_RESERVE(2); + KABI_RESERVE(3); + KABI_RESERVE(4); +#ifdef CONFIG_ARM64 +} ____cacheline_aligned; +#else +}; +#endif + +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + WARN_ON_ONCE(rq->curr != prev); + prev->sched_class->put_prev_task(rq, prev); +} + +static inline void set_next_task(struct rq *rq, struct task_struct *next) +{ + next->sched_class->set_next_task(rq, next, false); +} + +#ifdef CONFIG_SMP +#define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = class->next) + +#define for_each_class(class) \ + for_class_range(class, sched_class_highest, NULL) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; +#ifdef CONFIG_BT_SCHED +extern const struct sched_class bt_sched_class; +#endif + +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} + +#ifdef CONFIG_SMP + +extern void update_group_capacity(struct sched_domain *sd, int cpu); + +extern void trigger_load_balance(struct rq *rq); +#ifdef CONFIG_BT_SCHED +extern void trigger_load_balance_bt(struct rq *rq); +#endif + +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + +#if defined(CONFIG_BT_GROUP_SCHED) +extern void idle_enter_bt(struct rq *this_rq); +extern void idle_exit_bt(struct rq *this_rq); +#else +static inline void idle_enter_bt(struct rq *this_rq) {} +static inline void idle_exit_bt(struct rq *this_rq) {} +#endif + +#endif + +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + SCHED_WARN_ON(!rcu_read_lock_held()); + + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + +extern void schedule_idle(void); + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); + +extern void init_sched_dl_class(void); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); +#ifdef CONFIG_BT_SCHED +extern void init_sched_bt_class(void); +extern void update_idle_cpu_bt_load(struct rq *this_rq); +extern 
void init_bt_entity_runnable_average(struct sched_bt_entity *se); +extern void post_init_bt_entity_util_avg(struct sched_bt_entity *se); +#endif + +extern void reweight_task(struct task_struct *p, int prio); + +extern void resched_curr(struct rq *rq); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +#ifdef CONFIG_BT_BANDWIDTH +extern struct bt_bandwidth def_bt_bandwidth; +extern void init_bt_bandwidth(struct bt_bandwidth *bt_b, u64 period, u64 runtime); +#endif + +#ifdef CONFIG_BT_SHARE_CFS_BANDWIDTH +extern void do_sched_bt_slack_timer(struct cfs_bandwidth *cfs_b); +extern int do_sched_bt_period_timer_share(struct cfs_bandwidth *cfs_b, unsigned long flags); +extern void __refill_cfs_bandwidth_runtime_bt(struct cfs_bandwidth *cfs_b); +extern void unthrottle_bt_rq_share(struct bt_rq *bt_rq); +extern void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b); +extern int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire); +extern bool cfs_bandwidth_used(void); +#endif + +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); + +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) +unsigned long to_ratio(u64 period, u64 runtime); + +extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p); + +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); +extern int __init sched_tick_offload_init(void); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. 
+ */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline int sched_tick_offload_init(void) { return 0; } +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + unsigned prev_nr = RQ_CFS_NR_RUNNING(rq); + + rq->nr_running += count; + +#ifdef CONFIG_SMP + if (prev_nr < 2 && RQ_CFS_NR_RUNNING(rq) >= 2) { + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); + } + +#ifdef CONFIG_BT_SCHED + if (rq->bt_nr_running >= 2) { + if (!READ_ONCE(rq->rd->overload_bt)) + WRITE_ONCE(rq->rd->overload_bt, 1); + } +#endif +#endif + + sched_update_tick_dependency(rq); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); +} + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + +#ifdef CONFIG_SMP + +static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) +{ +#ifdef CONFIG_SCHED_CORE + /* + * In order to not have {0,2},{1,3} turn into into an AB-BA, + * order by core-id first and cpu-id second. + * + * Notably: + * + * double_rq_lock(0,3); will take core-0, core-1 lock + * double_rq_lock(1,2); will take core-1, core-0 lock + * + * when only cpu-id is considered. + */ + if (rq1->core->cpu < rq2->core->cpu) + return true; + if (rq1->core->cpu > rq2->core->cpu) + return false; + + /* + * __sched_core_flip() relies on SMT having cpu-id lock order. + */ +#endif + return rq1->cpu < rq2->cpu; +} + +extern void double_rq_lock(struct rq *rq1, struct rq *rq2); +#ifdef CONFIG_PREEMPTION + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. 
+ */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower CPU-ids and will + * grant the double lock to lower CPUs over higher ids under contention, + * regardless of entry order into the function. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + if (__rq_lockp(this_rq) == __rq_lockp(busiest)) + return 0; + + if (likely(raw_spin_rq_trylock(busiest))) + return 0; + + if (rq_order_less(this_rq, busiest)) { + raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + return 0; + } + + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; +} +#endif /* CONFIG_PREEMPTION */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + lockdep_assert_irqs_disabled(); + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + if (__rq_lockp(this_rq) != __rq_lockp(busiest)) + raw_spin_rq_unlock(busiest); + lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_); +} + +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + raw_spin_lock(l1); + raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_unlock(rq2); + else + __release(rq2->lock); + raw_spin_rq_unlock(rq1); +} + +extern void set_rq_online (struct rq *rq); +extern void set_rq_offline(struct rq *rq); +extern bool sched_smp_initialized; + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_rq_lock(rq1); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_rq_unlock(rq1); + __release(rq2->lock); +} + +#endif + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_SCHED_DEBUG +extern bool sched_debug_enabled; + +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); +extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); +#ifdef CONFIG_NUMA_BALANCING +extern void +show_numa_stats(struct task_struct *p, struct seq_file *m); +extern void +print_numa_stats(struct seq_file *m, int node, unsigned long tsf, + unsigned long tpf, unsigned long gsf, unsigned long gpf); +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); +#ifdef CONFIG_BT_SCHED +extern void init_bt_rq(struct bt_rq *bt_rq); +#endif + +#ifdef CONFIG_SCHED_DEBUG +#ifdef CONFIG_BT_SCHED +extern void print_bt_stats(struct seq_file *m, int cpu); +extern void print_bt_rq(struct seq_file *m, int cpu, struct bt_rq *bt_rq); +#endif +#endif + +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); + +#ifdef CONFIG_NO_HZ_COMMON +#define NOHZ_BALANCE_KICK_BIT 0 +#define NOHZ_STATS_KICK_BIT 1 + +#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) + +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) + +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(struct rq *rq); +#else +static inline void nohz_balance_exit_idle(struct rq *rq) { } +#endif + + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +struct irqtime { + u64 total; + u64 tick_delta; + u64 irq_start_time; + struct u64_stats_sync sync; +}; + +DECLARE_PER_CPU(struct irqtime, cpu_irqtime); + +/* + * Returns the irqtime minus the softirq time computed by ksoftirqd. + * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * and never move forward. + */ +static inline u64 irq_time_read(int cpu) +{ + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); + unsigned int seq; + u64 total; + + do { + seq = __u64_stats_fetch_begin(&irqtime->sync); + total = irqtime->total; + } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); + + return total; +} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. 
+ * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. + * + * This function is called by the scheduler on the CPU whose utilization is + * being updated. + * + * It can only be called from RCU-sched read-side critical sections. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). + * + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT tasks. + */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); + if (data) + data->func(data, rq_clock(rq), flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef CONFIG_UCLAMP_TASK +unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +/** + * uclamp_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ +static __always_inline +unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + unsigned int min_util; + unsigned int max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + if (p) { + min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); + max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); + } + + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} + +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return uclamp_util_with(rq, util, NULL); +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. 
+ */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + return util; +} +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return util; +} + +static inline bool uclamp_is_used(void) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK */ + +#ifdef arch_scale_freq_capacity +# ifndef arch_scale_freq_invariant +# define arch_scale_freq_invariant() true +# endif +#else +# define arch_scale_freq_invariant() false +#endif + +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif + +/** + * enum schedutil_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency + * @ENERGY_UTIL: Utilization used during energy calculation + * + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time + * need to be aggregated differently depending on the usage made of them. This + * enum is used within schedutil_freq_util() to differentiate the types of + * utilization expected by the callers, and adjust the aggregation accordingly. + */ +enum schedutil_type { + FREQUENCY_UTIL, + ENERGY_UTIL, +}; + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p); + +static inline unsigned long cpu_bw_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + + return util; +} + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) +{ + return 0; +} +#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} +#endif + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + +#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ + return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ + +#define perf_domain_span(pd) NULL +static inline bool sched_energy_enabled(void) { return false; } + +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#ifdef CONFIG_MEMBARRIER +/* + * The scheduler provides memory barriers required by membarrier between: + * - prior user-space memory accesses and store to rq->membarrier_state, + * - store to rq->membarrier_state and following user-space memory accesses. + * In the same way it provides those guarantees around store to rq->curr. + */ +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + int membarrier_state; + + if (prev_mm == next_mm) + return; + + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} +#else +static inline void membarrier_switch_mm(struct rq *rq, + struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ +} +#endif diff --git a/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/stats.h b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/stats.h new file mode 100644 index 0000000000000000000000000000000000000000..398035545a6a8c4263d2025f4431cfc322bff4c5 --- /dev/null +++ b/ops/os_stat/os_stat/include_tk4_arm/kernel/sched/stats.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +#ifdef CONFIG_SCHEDSTATS + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcount++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_cpu_time += delta; +} + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} + +static inline void update_schedstat_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var) do { var++; } while (0) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define __schedstat_add(var, amt) do { var += (amt); } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) +#define schedstat_update_avg(var, val) do { update_schedstat_avg(var, val); } while (0) + +#else /* !CONFIG_SCHEDSTATS: */ +static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } +# define schedstat_enabled() 0 +# define __schedstat_inc(var) do { } while (0) +# define schedstat_inc(var) do { } while (0) +# define __schedstat_add(var, amt) do { } while (0) +# define schedstat_add(var, amt) do { } while (0) +# define __schedstat_set(var, val) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +# define schedstat_val(var) 0 +# define schedstat_val_or_zero(var) 0 +# define schedstat_update_avg(var, val) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (static_branch_likely(&psi_disabled)) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->in_memstall) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (static_branch_likely(&psi_disabled)) + return; + + if (!sleep) { + if (p->in_memstall) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (static_branch_likely(&psi_disabled)) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || p->in_memstall)) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->in_memstall) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (static_branch_likely(&psi_disabled)) + return; + + if (unlikely(rq->curr->in_memstall)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + +#ifdef CONFIG_SCHED_INFO +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * We are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a CPU, we call this routine + * from dequeue_task() to account for possible rq->clock skew across CPUs. The + * delta taken on each CPU would annul the skew. 
+ */ +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) +{ + unsigned long long now = rq_clock(rq), delta = 0; + + if (sched_info_on()) { + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + } + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(rq, delta); +} + +/* + * Called when a task finally hits the CPU. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct rq *rq, struct task_struct *t, struct task_struct *prev) +{ + unsigned long long now = rq_clock(rq), delta = 0; + + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcount++; + rq_sched_info_arrive(rq, delta); +#ifdef CONFIG_CGROUP_SLI + sli_schedlat_rundelay(t, prev, delta); +#endif +} + +/* + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct rq *rq, struct task_struct *t) +{ + if (sched_info_on()) { + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); + } +} + +/* + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. + */ +static inline void sched_info_depart(struct rq *rq, struct task_struct *t) +{ + unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; + + rq_sched_info_depart(rq, delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(rq, t); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +{ + /* + * prev now departs the CPU. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
+ */
+	if (prev != rq->idle)
+		sched_info_depart(rq, prev);
+
+	if (next != rq->idle)
+		sched_info_arrive(rq, next, prev);
+}
+
+static inline void
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+{
+	if (sched_info_on())
+		__sched_info_switch(rq, prev, next);
+}
+
+#else /* !CONFIG_SCHED_INFO: */
+# define sched_info_queued(rq, t)	do { } while (0)
+# define sched_info_reset_dequeued(t)	do { } while (0)
+# define sched_info_dequeued(rq, t)	do { } while (0)
+# define sched_info_depart(rq, t)	do { } while (0)
+# define sched_info_arrive(rq, t, prev)	do { } while (0)
+# define sched_info_switch(rq, t, next)	do { } while (0)
+#endif /* CONFIG_SCHED_INFO */
diff --git a/ops/os_stat/os_stat/io_scene/io_bfq_scene.c b/ops/os_stat/os_stat/io_scene/io_bfq_scene.c
new file mode 100644
index 0000000000000000000000000000000000000000..ff5d720c8a73c8479003dfbff06defef210ad1b2
--- /dev/null
+++ b/ops/os_stat/os_stat/io_scene/io_bfq_scene.c
@@ -0,0 +1,467 @@
+/*
+ * debug kernel problem
+ * aurelianliu@tencent.com
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../scene_layer.h"
+#include "../syms.h"
+#include "../data_aware.h"
+
+/*
+ * see bio insert queue -> req dispatch,
+ * see req dispatch -> bio endio
+ * see ata command issue -> ata cmd complete
+ * see process to queue work, and work wakeup
+ */
+static char functions[][32] = {
+	"bfq_insert_requests",
+	"bfq_dispatch_request",
+	"ata_qc_complete",
+	"ata_qc_complete_internal",
+	"ata_qc_issue",
+	"blk_mq_run_work_fn",
+	"scsi_mq_get_budget",
+	"mod_delayed_work_on",
+};
+
+unsigned long stat_recored_num = 1024;
+char *recored_per_disk_cmd[70];
+char *recored_per_sector[70];
+unsigned long *recored_time[70];
+unsigned long *recored_req[70];
+
+void stat__blk_mq_delay_run_hw_queue_before(struct blk_mq_hw_ctx *hctx, bool async,
+					unsigned long msecs)
+{
+	if (hctx)
+		hctx->run_work.kabi_reserved1 = 0x55aa;
+}
+bool stat_mod_delayed_work_on_after(int cpu, struct workqueue_struct *wq,
+		struct delayed_work *dwork, unsigned long delay, int ret)
+{
+	if (dwork && dwork->work.kabi_reserved2 == 0) {
+		if (1 || dwork->kabi_reserved1 == 0x55aa) {
+			dwork->work.kabi_reserved1 = current->pid;
+			dwork->work.kabi_reserved2 = sched_clock();
+		}
+	}
+
+	return true;
+}
+
+bool stat__bfq_insert_request_before(unsigned long arg1, unsigned long arg2)
+{
+	struct request *rq = (struct request *)arg2;
+
+	if (rq) {
+		struct bio *bio_tmp;
+
+		if (rq->bio) {
+			unsigned long time1 = sched_clock();
+			for (bio_tmp = rq->bio; bio_tmp; bio_tmp = bio_tmp->bi_next)
+				bio_tmp->kabi_reserved1 = time1;
+		}
+	}
+
+	return true;
+}
+//bio_endio
+static int print_bio_time;
+void stat_bio_endio_before(struct bio *bio, int name)
+{
+	unsigned long time1 = sched_clock();
+	char buf[128];
+
+	if (!strstr(get_one_func_name(name), "bio_endio"))
+		return;
+	if (bio->kabi_reserved2 > 0 && time1 - bio->kabi_reserved2 > 500000000) {
+		sprintf(buf, "%16s, delta:%8ld, sector:%lx, cpu:%d\n",
+			current->comm, time1 - bio->kabi_reserved2,
+			bio->bi_iter.bi_sector,
+			smp_processor_id());
+		store_info(0, 0, 0, current->comm, buf, ":end_above");
+	}
+}
+
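+/*
+ * Note added for clarity: the hooks above stash a sched_clock() timestamp
+ * (and the queuing pid) in the otherwise unused kabi_reserved fields of the
+ * bio and work_struct when a request is inserted or a hardware-queue run is
+ * scheduled.  The hooks below read those fields back at dispatch/completion
+ * time and report the elapsed time through store_info() once it exceeds a
+ * hard-coded threshold, which is how the insert -> dispatch -> endio latency
+ * chain is reconstructed without modifying kernel structures.
+ */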
+void stat_blk_mq_run_work_fn(struct work_struct *work)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long time;
+
+	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
+	if (hctx) {
+		struct request_queue *q = hctx->queue;
+		char buf[128];
+
+		time = sched_clock();
+		if (q && time - hctx->run_work.work.kabi_reserved2 > 10000000
+		    && hctx->run_work.work.kabi_reserved2 != 0
+		    && time - hctx->run_work.work.kabi_reserved2 < 10000000000) {
+			sprintf(buf, "state:%ld, quiesced:%d, time:%ld, waker:%d, waked proc:%s, %d, %lx, delta:%ld\n",
+				hctx->state, blk_queue_quiesced(q),
+				sched_clock(), hctx->run_work.work.kabi_reserved1, current->comm, current->pid, &hctx->run_work,
+				time - hctx->run_work.work.kabi_reserved2);
+			store_info(0, 0, 0, current->comm, buf, "run work");
+		}
+	}
+}
+void stat_scsi_mq_get_budget_after(struct blk_mq_hw_ctx *hctx, unsigned long ret)
+{
+	if (!ret) {
+		char buf[128];
+		sprintf(buf, "budget?%ld, time:%ld, proc:%s, %d\n", ret,
+			sched_clock(), current->comm, current->pid);
+		store_info(0, 0, 0, current->comm, buf, "get_budget");
+	}
+}
+
+static char per_disk_name[70][8] = {0};
+unsigned long stat_recored_name_cycle[70];
+void test_bfq_insert_requests_before(unsigned long arg1, unsigned long arg2)
+{
+	struct list_head *list = (struct list_head *)arg2;
+	struct request *rq;
+	struct bio *bio_tmp;
+	unsigned long time1 = sched_clock();
+	struct scsi_request *req;
+	char buf[128];
+
+	if (!list)
+		return;
+	if (!list_empty(list)) {
+		list_for_each_entry(rq, list, queuelist) {
+			if (!rq)
+				return;
+
+			if (rq->bio) {
+				int i, index;
+				if (sysctl_enable_debug
+				    && !strstr(rq->bio->bi_disk->disk_name, printk_name_last))
+					continue;
+				for (i = 0; i < 70; i++) {
+					if (strstr(per_disk_name[i], rq->bio->bi_disk->disk_name))
+						break;
+					if (per_disk_name[i][0] == '\0') {
+						strcpy(per_disk_name[i], rq->bio->bi_disk->disk_name);
+						break;
+					}
+				}
+				if (i >= 70)
+					return;
+				index = stat_recored_name_cycle[i]++;
+				for (bio_tmp = rq->bio; bio_tmp; bio_tmp = bio_tmp->bi_next) {
+					bio_tmp->kabi_reserved1 = time1;
+					bio_tmp->kabi_reserved2 = current->pid;
+					sprintf(buf, " sector:%lx, %s, %d, %s, cpu:%d", bio_tmp->bi_iter.bi_sector, current->comm, current->pid,
+						bio_tmp->bi_disk->disk_name, smp_processor_id());
+					store_info(0, 0, 0, current->comm, buf, "insert bio");
+				}
+			}
+		}
+	}
+}
+
+static int recore_scsi_cmd = 1;
+static unsigned long per_disk_addr[70];
+unsigned long stat_recored_num_cycle[70];
+void stat_ata_qc_complete_internal(struct ata_queued_cmd *qc)
+{
+	int i;
+	unsigned long *time_ptr;
+	char buf[128];
+
+	if (!qc || !qc->scsicmd || !qc->scsicmd->req.cmd)
+		return;
+
+	for (i = 0; i < 70; i++) {
+		if (per_disk_addr[i] == (unsigned long)qc)
+			break;
+	}
+	if (i >= 70)
+		return;
+
+	if (recored_per_disk_cmd[i]) {
+		int j, f;
+		char *ptr;
+
+		f = stat_recored_num_cycle[i];
+		for (j = 0; j < stat_recored_num; j++) {
+			ptr = recored_per_disk_cmd[i] + (f % stat_recored_num) * 128;
+			time_ptr = recored_time[i] + f % stat_recored_num;
+			sprintf(buf, "ata:%s, delta:%lx, req:%lx", ptr, sched_clock() - *time_ptr, (unsigned long)&qc->scsicmd->req);
+			store_info(0, 0, 0, current->comm, buf, "ata complete");
+			f++;
+		}
+	}
+}
+
+void stat_ata_qc_issue_before(struct ata_queued_cmd *qc)
+{
+	int i, j, m;
+	unsigned long dev_ptr = 0, *time_ptr, *req_ptr;
+	char *ptr;
+	struct request *rq = NULL;
+	char buf[128];
+
+	if (recore_scsi_cmd == 0)
+		return;
+
+	if (!qc || !qc->scsicmd || !qc->scsicmd->req.cmd)
+		return;
+
+	for (i = 0; i < 70; i++) {
+		if (per_disk_addr[i] == (unsigned long)qc)
+			break;
+		if (per_disk_addr[i] == 0)
+			break;
+	}
+
+	if (i >= 70)
+		return;
+	per_disk_addr[i] = (unsigned long)qc;
+
+	rq = blk_mq_rq_from_pdu(qc->scsicmd);
+
+	if (recored_per_disk_cmd[i]) {
+		j = stat_recored_num_cycle[i]++;
+		ptr = recored_per_disk_cmd[i] + (j % stat_recored_num) * 128;
+		for (m = 0; m < qc->scsicmd->req.cmd_len && m < 128; m++)
+			ptr[m] =
qc->scsicmd->req.__cmd[m]; + time_ptr = recored_time[i] + j % stat_recored_num; + *time_ptr = sched_clock(); + req_ptr = recored_req[i] + j % stat_recored_num; + *req_ptr = &qc->scsicmd->req; + sprintf(buf, "%ld, req:%lx, %lx, qc:%lx, %d, %d, %lx, %lx, %lx, %lx", + *time_ptr, *req_ptr, &qc->scsicmd->req, qc, i, j, req_ptr, time_ptr, + recored_time[i] + j % stat_recored_num, + recored_req[i] + j % stat_recored_num); + store_info(0, 0, 0, current->comm, buf, "ata issue"); + if (rq && rq->bio) { + sprintf(buf, "qc:%lx, req:%lx :%x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, proc:%s, %d, origin:%d, sector:%lx", + qc, *req_ptr, + qc->scsicmd->req.__cmd[0], qc->scsicmd->req.__cmd[1], qc->scsicmd->req.__cmd[2], qc->scsicmd->req.__cmd[3], + qc->scsicmd->req.__cmd[4], qc->scsicmd->req.__cmd[5], qc->scsicmd->req.__cmd[6], qc->scsicmd->req.__cmd[7], + qc->scsicmd->req.__cmd[8], qc->scsicmd->req.__cmd[9], qc->scsicmd->req.__cmd[10], qc->scsicmd->req.__cmd[11], + qc->scsicmd->req.__cmd[12], qc->scsicmd->req.__cmd[13], qc->scsicmd->req.__cmd[14], qc->scsicmd->req.__cmd[15], + current->comm, current->pid, rq->bio->kabi_reserved2, rq->bio->bi_iter.bi_sector); + store_info(0, 0, 0, current->comm, buf, "ata issue"); + } + else + pr_info("qc:%lx, req:%lx :%x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, proc:%s, %d", qc, *req_ptr, + qc->scsicmd->req.__cmd[0], qc->scsicmd->req.__cmd[1], qc->scsicmd->req.__cmd[2], qc->scsicmd->req.__cmd[3], + qc->scsicmd->req.__cmd[4], qc->scsicmd->req.__cmd[5], qc->scsicmd->req.__cmd[6], qc->scsicmd->req.__cmd[7], + qc->scsicmd->req.__cmd[8], qc->scsicmd->req.__cmd[9], qc->scsicmd->req.__cmd[10], qc->scsicmd->req.__cmd[11], + qc->scsicmd->req.__cmd[12], qc->scsicmd->req.__cmd[13], qc->scsicmd->req.__cmd[14], qc->scsicmd->req.__cmd[15], + current->comm, current->pid); + } +} + +void stat__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx, unsigned long ret) +{ + struct request *rq = (struct request *)ret; + struct bio *bio, *bio_tmp; + unsigned long time1 = sched_clock(); + int dev_ptr = 0; + struct scsi_request *req; + char buf[128]; + + if (!rq) + return; + + bio = rq->bio; + if (!bio) + return; + + if (!strstr(rq->rq_disk->disk_name, printk_name_last)) + return; + + for (bio_tmp = bio; bio_tmp; bio_tmp = bio_tmp->bi_next) { + if (time1 - bio_tmp->kabi_reserved1 < 10000000000 && time1 - bio_tmp->kabi_reserved1 > 1500000000 && rq->rq_disk) { + req = scsi_req(rq); + if (recore_scsi_cmd == 1 && !strstr(current->comm, "hdparm") && !strstr(current->comm, "smartctl")) { + sprintf(buf, "dispatch: %16s, delta:%8ld, sector:%lx, len:%d, disk:%s, source pid:%d, exec delta:%ld, cpu:%d, %pF, req:%lx\n", + current->comm, time1 - bio_tmp->kabi_reserved1, + bio_tmp->bi_iter.bi_sector, rq->__data_len, rq->rq_disk->disk_name, + bio_tmp->kabi_reserved2, + time1 - hctx->run_work.work.kabi_reserved2, + smp_processor_id(), bio_tmp->bi_end_io, req); + store_info(0, 0, 0, current->comm, buf, "dispatch req above"); + } + + recore_scsi_cmd = 0; + if (1) { + int i, j, f; + char *ptr; + unsigned long *time_ptr, *req_ptr, time; + for (i = 0; i < 70; i++) { + if (per_disk_addr[i] == 0) + continue; + if (recored_per_disk_cmd[i]) { + f = stat_recored_num_cycle[i]; + for (j= 0; j < stat_recored_num; j++) { + ptr = recored_per_disk_cmd[i] + (f % stat_recored_num) * 128; + time_ptr = recored_time[i] + f % stat_recored_num; + req_ptr = recored_req[i] + f % stat_recored_num; + time = (sched_clock() - *time_ptr) / 1000000; + if (*req_ptr != 0 && time < 2000) { + sprintf(buf, "%d, 
%d, %d, qc:%lx, req:%lx, %lx, :%x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, %x, delta:%ld, %lx, %lx, %lx, %lx",
+								i, j, f % stat_recored_num, per_disk_addr[i], req, *req_ptr,
+								ptr[0], ptr[1], ptr[2], ptr[3],
+								ptr[4], ptr[5], ptr[6], ptr[7],
+								ptr[8], ptr[9], ptr[10], ptr[11],
+								ptr[12], ptr[13], ptr[14], ptr[15],
+								time,
+								time_ptr, req_ptr,
+								recored_time[i] + j % stat_recored_num,
+								recored_req[i] + j % stat_recored_num
+								);
+							store_info(0, 0, 0, current->comm, buf, "dispatch cmd");
+						}
+						f++;
+					}
+				}
+			}
+
+			for (i = 0; i < 70; i++) {
+				if (!strstr(per_disk_name[i], rq->bio->bi_disk->disk_name))
+					continue;
+				f = stat_recored_name_cycle[i];
+				for (j = 0; j < stat_recored_num; j++) {
+					ptr = recored_per_sector[i] + (f % stat_recored_num) * 128;
+					sprintf(buf, "%d, %s", f % stat_recored_num, ptr);
+					store_info(0, 0, 0, current->comm, buf, "dispatch sector");
+					f++;
+				}
+				break;
+			}
+		}
+		} else {
+			sprintf(buf, "less, rq ---%16s, delta:%8ld, sector:%lx, disk:%s, cpu:%d\n",
+				current->comm, time1 - bio_tmp->kabi_reserved1,
+				bio_tmp->bi_iter.bi_sector, rq->rq_disk->disk_name,
+				smp_processor_id());
+			store_info(0, 0, 0, current->comm, buf, "less dispatch");
+		}
+		hctx->run_work.work.kabi_reserved2 = 0;
+	}
+}
+
+void io_scene_bfq_before(unsigned long arg1, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5, unsigned long arg6)
+{
+
+	if (strstr(ftrace_hook_name, functions[0])) {
+		test_bfq_insert_requests_before(arg1, arg2);
+	}
+}
+
+void io_scene_bfq_after_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test)
+{
+	if (strstr(get_one_func_name(1), functions[1])) {
+		stat__bfq_dispatch_request((struct blk_mq_hw_ctx *)arg1, ret);
+		/* add more debug info */
+	}
+}
+
+unsigned long io_scene_bfq_before_2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6)
+{
+	if (strstr(get_one_func_name(2), functions[2])) {
+		stat_ata_qc_complete_internal((struct ata_queued_cmd *)arg1);
+		/* add more debug info */
+	}
+
+	return 0;
+}
+
+unsigned long io_scene_bfq_before_3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6)
+{
+	if (strstr(get_one_func_name(3), functions[3])) {
+		stat_ata_qc_complete_internal((struct ata_queued_cmd *)arg1);
+		/* add more debug info */
+	}
+
+	return 0;
+}
+
+unsigned long io_scene_bfq_before_4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6)
+{
+	if (strstr(get_one_func_name(4), functions[4]))
+		stat_ata_qc_issue_before((struct ata_queued_cmd *)arg1);
+
+	return 0;
+}
+
+unsigned long io_scene_bfq_before_5(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6)
+{
+	if (strstr(get_one_func_name(5), functions[5]))
+		stat_blk_mq_run_work_fn((struct work_struct *)arg1);
+
+	return 0;
+}
+
+void io_scene_bfq_after_6(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test)
+{
+	if (strstr(get_one_func_name(6), functions[6]))
+		stat_scsi_mq_get_budget_after((struct blk_mq_hw_ctx *)arg1, ret);
+}
+
+void io_scene_bfq_after_7(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test)
+{
+	if (strstr(get_one_func_name(7), functions[7]))
+		stat_mod_delayed_work_on_after(arg1, (struct workqueue_struct *)arg2,
+					       (struct delayed_work *)arg3, arg4, ret);
+}
+
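+/*
+ * Note added for clarity: io_scene_bfq_init() below carves four vzalloc()'d
+ * regions into 70 per-disk ring buffers (128-byte slots for the raw SCSI CDB
+ * bytes and, per the naming, per-sector strings, plus parallel arrays of
+ * issue timestamps and request pointers), each stat_recored_num entries deep.
+ * The dispatch/complete hooks above walk these buffers using the *_cycle
+ * counters.
+ */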
+void io_scene_bfq_init(void)
+{
+	int i;
+
+	recored_per_disk_cmd[0] = vzalloc(70 * stat_recored_num * 128 * sizeof(char));
+	if (!recored_per_disk_cmd[0])
+		return;
+
+	for (i = 1; i < 70; i++) {
+		recored_per_disk_cmd[i] = recored_per_disk_cmd[i - 1] + stat_recored_num * 128;
+	}
+
+	recored_per_sector[0] = vzalloc(70 * stat_recored_num * 128 * sizeof(char));
+	if (!recored_per_sector[0])
+		return;
+
+	for (i = 1; i < 70; i++) {
+		recored_per_sector[i] = recored_per_sector[i - 1] + stat_recored_num * 128;
+	}
+
+	recored_time[0] = (unsigned long *)vzalloc(70 * stat_recored_num * sizeof(unsigned long));
+	if (!recored_time[0])
+		return;
+
+	for (i = 1; i < 70; i++) {
+		recored_time[i] = recored_time[i - 1] + stat_recored_num;
+	}
+
+	recored_req[0] = (unsigned long *)vzalloc(70 * stat_recored_num * sizeof(unsigned long));
+	if (!recored_req[0])
+		return;
+
+	for (i = 1; i < 70; i++) {
+		recored_req[i] = recored_req[i - 1] + stat_recored_num;
+	}
+}
+
+
diff --git a/ops/os_stat/os_stat/io_scene/io_bfq_scene.h b/ops/os_stat/os_stat/io_scene/io_bfq_scene.h
new file mode 100644
index 0000000000000000000000000000000000000000..86a82edd52f590fb0ce7bf42da10d5175ad3a2c8
--- /dev/null
+++ b/ops/os_stat/os_stat/io_scene/io_bfq_scene.h
@@ -0,0 +1,42 @@
+/*
+ * debug kernel problem
+ * aurelianliu@tencent.com
+ */
+#include "../version.h"
+#ifndef TK2
+extern void io_scene_bfq_before(unsigned long arg1, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5, unsigned long arg6);
+extern void io_scene_bfq_after_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test);
+extern unsigned long io_scene_bfq_before_2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6);
+extern unsigned long io_scene_bfq_before_3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6);
+extern unsigned long io_scene_bfq_before_4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6);
+extern unsigned long io_scene_bfq_before_5(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6);
+extern void io_scene_bfq_after_6(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test);
+extern void io_scene_bfq_after_7(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test);
+extern void io_scene_bfq_init(void);
+#else
+static void io_scene_bfq_before(unsigned long arg1, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5, unsigned long arg6){}
+static void io_scene_bfq_after_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test){}
+static unsigned long io_scene_bfq_before_2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6){return 0;}
+static unsigned long io_scene_bfq_before_3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6){return 0;}
+static unsigned long io_scene_bfq_before_4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+
unsigned long arg5, unsigned long arg6){return 0;} +static unsigned long io_scene_bfq_before_5(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6){return 0;} +static void io_scene_bfq_after_6(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test){} +static void io_scene_bfq_after_7(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test){} +static void io_scene_bfq_init(void){} +#endif diff --git a/ops/os_stat/os_stat/io_scene/io_scene.c b/ops/os_stat/os_stat/io_scene/io_scene.c new file mode 100644 index 0000000000000000000000000000000000000000..ef1b857b02773050199ea248c8fb165f8cf994fb --- /dev/null +++ b/ops/os_stat/os_stat/io_scene/io_scene.c @@ -0,0 +1,825 @@ +/* + * debug kernel problem + * aurelianliu@tencent.com + */ +#include +#include +#include +#include +#include +#include +#include "../version.h" +#ifdef TK2 +#include +#include +#endif +#include "../scene_layer.h" +#include "../syms.h" +#include "../data_aware.h" +#include "io_bfq_scene.h" + +enum bio_pos { + BIO_SUBMIT, + BIO_DISPATCH, + BIO_ENDBIO +}; + +static char functions[][32] = { + "ext4_file_read_iter", + "ext4_da_reserve_space", + "blk_mq_get_driver_tag", + "ext4_mb_new_blocks", + "ext4_claim_free_clusters", + "bio_endio", + "ext4_da_release_space", + "ext4_rereserve_cluster", + "ext4_clear_inode", + "vfs_read", + "blk_account_io_start", + "submit_bio", + "queue_io", + "mark_buffer_dirty", + "generic_update_time", + "ext4_da_update_reserve_space", + "worker_enter_idle", + "wake_up_process", + "requeue_inode", + "move_expired_inodes", + "ext4_nonda_switch" +}; + +#define RLIMIT_NOFILE 7 +static unsigned long stat_block_add; +static unsigned long stat_block_dec; +static unsigned long stat_block_mb_new; +static unsigned long inode_res1; +static unsigned long inode_hash1; +static unsigned long inode_res2; +static unsigned long inode_hash2; +static unsigned long hash(struct super_block *sb, unsigned long hashval); +static int print_num; +static bool print_once = true; + +int adjust_ext4_nonda_switch(struct super_block *sb) +{ + s64 free_clusters, dirty_clusters; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with percpu_counter_batch getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. So switch + * to non delalloc when we are near to error range. + */ + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + context_check_start(); + if (sb && sb->s_bdev && sb->s_bdev->bd_disk) { + if (!sysctl_enable_debug || sysctl_enable_debug + && strstr(sb->s_bdev->bd_disk->disk_name, printk_name_last)) { + store_info(current->pid, free_clusters, dirty_clusters, sb->s_bdev->bd_disk->disk_name, "write", "write"); + } + } + context_check_end(); + /* + * Start pushing delalloc when 1/2 of free blocks are dirty. 
+ */ + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) + try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); + + if (2 * free_clusters < 3 * dirty_clusters || + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { + /* + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark + */ + return 1; + } + return 0; +} + +//blk_mq_get_driver_tag, for dispatch list: submit_bio->dispatch->endio_bio +//ext4_file_read_iter for s_dirtyclusters_counter +unsigned long io_scene_before_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret = 0; + struct page *page = (struct page *)arg1; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + struct request *rq; + + if (strstr(get_one_func_name(1), functions[0])) { + s64 free_clusters, dirty_clusters; + struct kiocb *iocb = (struct kiocb *)arg1; + struct inode *inode = NULL; + struct super_block *sb = NULL; + struct ext4_sb_info *sbi = NULL; + + if (iocb) + inode = file_inode(iocb->ki_filp); + if (!inode) + goto next; + sb = inode->i_sb; + if (!sb) + goto next; + sbi = EXT4_SB(sb); + if (!sbi) + goto next; + + if (sb && sb->s_bdev && sb->s_bdev->bd_disk) { + if (!sysctl_enable_debug || (sysctl_enable_debug + && strstr(sb->s_bdev->bd_disk->disk_name, printk_name_last))) { + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + store_info(current->pid, free_clusters, dirty_clusters, sb->s_bdev->bd_disk->disk_name, "read", "read"); + } + } + + goto next; + } + if (strstr(get_one_func_name(1), functions[1])) { + struct inode *inode = (struct inode *)arg1; + struct hlist_head *head; + struct ext4_inode_info *ei; + struct super_block *sb; + struct hlist_head *inode_hashtable_tmp = (struct hlist_head *)p_test_var_1; + + sb = inode->i_sb; + ei = EXT4_I(inode); + if (inode_res1 == 0 && ei && ei->i_reserved_data_blocks) { + inode_hash1 = hash(sb, inode->i_ino); + inode_res1 = ei->i_reserved_data_blocks; + } + else if (inode_res2 == 0 && ei && ei->i_reserved_data_blocks) { + inode_hash2 = hash(sb, inode->i_ino); + inode_res2 = ei->i_reserved_data_blocks; + } + goto next; + } + if (sysctl_enable_debug == 2 && strstr(get_one_func_name(1), functions[2])) { + rq = (struct request *)arg1; +#if !defined TK3 && !defined TK2 + if (rq && rq->bio) + rq->bio->kabi_reserved1 = 1 << BIO_DISPATCH; +#endif + } + +next: + return ret; +} + +/* + *ext4_claim_free_clusters + *ext4_mb_new_blocks +*/ +void io_scene_after_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test) +{ + struct page *page = (struct page *)arg1; + struct inode *inode = NULL; + + if (strstr(get_one_func_name(1), functions[0])) + goto out; + + if (strstr(get_one_func_name(1), functions[3])) { + int i; + struct ext4_allocation_request *ar = (struct ext4_allocation_request *)arg2; + struct super_block *s_sb = NULL; + struct ext4_sb_info *sbi; + + if (!ar || !ar->inode || (ar->flags & EXT4_MB_DELALLOC_RESERVED != 0)) + goto out; + s_sb = ar->inode->i_sb; + if (!s_sb) + goto out; + sbi = EXT4_SB(s_sb); + if (sbi && s_sb && s_sb->s_bdev && s_sb->s_bdev->bd_disk) { + if (sysctl_enable_debug + && strstr(s_sb->s_bdev->bd_disk->disk_name, printk_name_last)) { + + stat_block_mb_new += ar->len; + store_info(current->pid, ar->len, stat_block_mb_new, current->comm, + 
s_sb->s_bdev->bd_disk->disk_name, "add:1"); + } + } + match_index(arg2, ar->len); + } + if (strstr(get_one_func_name(1), functions[4]) && test == 0) { + int i; + struct ext4_sb_info *sbi = (struct ext4_sb_info *)arg1; + struct super_block *s_sb = NULL; + + if (sbi) + s_sb = sbi->s_sb; + if (!s_sb) + goto out; + if (s_sb && s_sb->s_bdev && s_sb->s_bdev->bd_disk) { + if (sysctl_enable_debug + && strstr(s_sb->s_bdev->bd_disk->disk_name, printk_name_last)) { + struct ext4_sb_info *sbi = (struct ext4_sb_info *)arg1; + + stat_block_add += arg2; + store_info(current->pid, arg2, stat_block_add, current->comm, + s_sb->s_bdev->bd_disk->disk_name, "add:1"); + } + } + match_index(arg2, 1); + } + + io_scene_bfq_after_1(arg1, arg2, arg3, arg4, arg5, ret, test); + +out: + return; +} + +/* + * endio_bio: + * submit_bio->dispatch->endio_bio + */ +unsigned long io_scene_before_2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret = 0; + struct page *page = (struct page *)arg1; + struct address_space *mapping = (struct address_space *)arg2; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + struct bio *bio; + + if (sysctl_enable_debug == 2 && strstr(get_one_func_name(2), functions[5])) { + bio = (struct bio *)arg1; +#if !defined TK3 && !defined TK2 + if (bio) + bio->kabi_reserved1 = 1 << BIO_ENDBIO; +#endif + goto next; + } + if (strstr(get_one_func_name(2), functions[6])) { + struct inode *inode = (struct inode *)arg1; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + + ei = EXT4_I(inode); + sbi = EXT4_SB(inode->i_sb); + + stat_block_dec += arg2; + store_info(current->pid, arg2, stat_block_dec, current->comm, + inode->i_sb->s_bdev->bd_disk->disk_name, "dec:2"); + match_dec_index(1, false, 2); + } + io_scene_bfq_before_2(arg1, arg2, arg3, arg4, arg5, arg6); +next: + return ret; +} +void io_scene_after_2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test) +{ + if (!test) + return; + + return; +} +unsigned long io_scene_before_3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + io_scene_bfq_before_3(arg1, arg2, arg3, arg4, arg5, arg6); + return 0; +} +void io_scene_after_3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test) +{ + struct page *page = (struct page *)arg1; + struct inode *inode = NULL; + + if (strstr(get_one_func_name(3), functions[1])) { + struct inode *inode = (struct inode *)arg1; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + stat_block_add += 1; + store_info(current->pid, 1, stat_block_add, current->comm, inode->i_sb->s_bdev->bd_disk->disk_name, "add:2"); + match_index(1, 2); + goto out; + } + + if (!test) + goto out; + +out: + return; +} + +/* ext4_rereserve_cluster */ +unsigned long io_scene_before_4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret = 0; + struct inode *inode = (struct inode *)arg2; + struct page *page = (struct page *)arg2; + if (strstr(get_one_func_name(4), functions[7])) { + struct inode *inode = (struct inode *)arg1; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + stat_block_add += 1; + 
store_info(current->pid, 1, stat_block_add, current->comm, inode->i_sb->s_bdev->bd_disk->disk_name, "add:3"); + match_index(1, 3); + goto next; + } + io_scene_bfq_before_4(arg1, arg2, arg3, arg4, arg5, arg6); +next: + return ret; +} + +void io_scene_after_4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test) +{ + if (!test) + return; + + return; +} + +/* ext4_clear_inode */ +unsigned long io_scene_before_5(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret; + + if (strstr(get_one_func_name(4), functions[8])) + match_dec_index(1, false, 3); + + io_scene_bfq_before_5(arg1, arg2, arg3, arg4, arg5, arg6); + + return ret; +}; + +void io_scene_after_5(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test) +{ + +}; + +/* ext4_evict_inode */ +unsigned long io_scene_before_6(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long ret; + if (strstr(get_one_func_name(4), functions[8])) + match_dec_index(1, false, 4); + + return ret; +}; + +void io_scene_after_6(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test) +{ + +}; + + +bool disk_check(struct request *rq) +{ +#ifdef TK5 + if (sysctl_enable_debug && rq->bio && rq->bio->bi_bdev && !strstr(rq->bio->bi_bdev->bd_disk->disk_name, printk_name_last)) +#else + if (sysctl_enable_debug && rq->rq_disk && !strstr(rq->rq_disk->disk_name, printk_name_last)) +#endif + return false; + return true; +} + +char *get_disk_name(struct request *rq) +{ +#ifdef TK5 + if (rq->bio && rq->bio->bi_bdev) + return rq->bio->bi_bdev->bd_disk->disk_name; +#else + if (rq->rq_disk) + return rq->rq_disk->disk_name; +#endif + return NULL; +} + +/* + * inode_hashtable + * inode_hash_lock + * ihash_entries + * nr_kernel_pages + * i_hash_shift + */ +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp, ret; + unsigned int *ihash_shift; + unsigned long entry, mask; + + if (!p_test_var_5) + return 0; + + ihash_shift = (unsigned int *)(p_test_var_5); + entry = 1 << *ihash_shift; + mask = entry - 1; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> *p_test_var_5); + ret = tmp & mask; + + return ret; +} +bool io_scene_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + int i; + bool ret = false, print; + struct task_struct *p; + + if (sysctl_enable_debug == SCENE_ADJUST_MORE_PARAMETER_ONE + && print_once && strstr(ftrace_hook_name, functions[9]) + && strstr(current->comm, "os_stat_test")) { + for_each_process(p) { + struct files_struct *files; + struct fdtable *fdt; + struct file *file; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + struct inode *inode = NULL; + int limit; + unsigned int maxfd; + struct hlist_head *inode_hashtable_tmp = (struct hlist_head *)(*p_test_var_1); + unsigned long *ihash_entries = (unsigned long *)p_test_var_3; + spinlock_t *lock = (spinlock_t *)p_test_var_2; + + if (!inode_hashtable_tmp || !ihash_entries || !lock) + goto next; + + limit = task_rlimit(p, RLIMIT_NOFILE); + + if(need_resched()) + schedule(); + 
+ if (!p->files) + continue; + fdt = files_fdtable(p->files); + if (!fdt) + continue; + maxfd = fdt->max_fds; + for (i = 0; i < maxfd && i < limit; i++) { + file = fdt->fd[i]; + + if(need_resched()) + schedule(); + + if (!file) + continue; + inode = file->f_inode; + if (!inode || !inode->i_sb || !inode->i_sb->s_bdev || !inode->i_sb->s_bdev->bd_disk) + continue; + ei = EXT4_I(inode); + sbi = EXT4_SB(inode->i_sb); + } + } + goto next; + } + + if ((sysctl_enable_debug == SCENE_ADJUST_ENABLE_ONLY + || sysctl_enable_debug == SCENE_ADJUST_MORE_PARAMETER_TWO) + && print_once && strstr(ftrace_hook_name, functions[9]) + && strstr(current->comm, "os_stat_test")) { + struct files_struct *files; + struct file *file = (struct file *)arg1; + struct fdtable *fdt; + struct file *f; + unsigned long nr, i, n, ino; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + struct inode *inode = NULL; + struct task_struct *p; + struct hlist_head *inode_hashtable_tmp = (struct hlist_head *)(*p_test_var_1); + spinlock_t *lock = (spinlock_t *)p_test_var_2; + unsigned long *ihash_entries = (unsigned long *)p_test_var_3; + int size = sizeof(struct hlist_head); + struct hlist_head *head; + struct super_block *sb; + unsigned int *ihash_shift = (unsigned int *)(p_test_var_5); + unsigned long entry; + unsigned long mask, j, m; + + if (file) + inode = file->f_inode; + if (inode) + sb = inode->i_sb; + + if (!inode_hashtable_tmp || !ihash_entries || !lock) + goto next; + + spin_lock(lock); + print_once = false; + entry = 1 << *ihash_shift; + mask = entry - 1; + for (n = 0; n < entry; n++) { + if(need_resched()) { + spin_unlock(lock); + schedule(); + spin_lock(lock); + } + + head = inode_hashtable_tmp + n; + + if (n == inode_hash1 || n == inode_hash2) { + if (n == inode_hash1) + inode_res1 = 0; + if (n == inode_hash2) + inode_res2 = 0; + } + if (!head) { + continue; + } + if (hlist_empty(head)) { + continue; + } + j = 0; + m = 0; + hlist_for_each_entry(inode, head, i_hash) { + if(need_resched()) { + spin_unlock(lock); + schedule(); + spin_lock(lock); + } + j++; + if (!inode || !inode->i_sb || !inode->i_sb->s_bdev + || !inode->i_sb->s_bdev->bd_disk) + continue; + + ei = EXT4_I(inode); + sbi = EXT4_SB(inode->i_sb); + if (ei && strstr(inode->i_sb->s_bdev->bd_disk->disk_name, printk_name_last) + && ei->i_reserved_data_blocks > 0) { + m++; + stat_write[0] = ei->i_reserved_data_blocks; + stat_process[0] = inode->i_ino; + store_info(current->pid, stat_write[0], stat_process[0], current->comm, + printk_name_last, "inode"); + } + } + } + spin_unlock(lock); + + ret = true; + goto next; + + } + + if (strstr(ftrace_hook_name, functions[10])) { + struct request *rq = (struct request *)arg1; + struct bio *bio; + struct page *page = NULL; + bool new_io = arg2; + unsigned int nr_segs; + if (!new_io) + goto next; + if (!rq) + goto next; + if (!disk_check(rq)) + goto next; + bio = rq->bio; + nr_segs = rq->nr_phys_segments; + if (bio && bio->bi_io_vec && bio->bi_io_vec->bv_page) + page = bio->bi_io_vec->bv_page; +#ifdef TK5 + if (bio && page && bio->bi_bdev && bio->bi_bdev->bd_disk) +#else + if (bio && page && rq->rq_disk) +#endif +#ifndef TK2 + pr_info("sector:%10ld, size:%8ld, segs:4%d, addr:%16lx, %16s, %d, disk:%s", + bio->bi_iter.bi_sector,bio->bi_iter.bi_size, + nr_segs, + page_to_phys(page), + current->comm, current->pid, get_disk_name(rq)); +#else + ; +#endif + } + if (strstr(ftrace_hook_name, functions[3]) + || strstr(ftrace_hook_name, functions[11]) + || strstr(ftrace_hook_name, functions[12]) + || strstr(ftrace_hook_name, 
functions[13]) + || strstr(ftrace_hook_name, functions[14]) + || strstr(ftrace_hook_name, functions[15]) + || strstr(ftrace_hook_name, functions[16]) + || strstr(ftrace_hook_name, functions[17]) + ) { + + if (strstr(ftrace_hook_name, functions[11])) { + struct bio *bio = (struct bio *)arg1; + if (sysctl_enable_debug == 2) { +#if !defined TK3 && !defined TK2 + bio->kabi_reserved1 = 1 << BIO_SUBMIT; +#endif + goto next; + } + } + + if (strstr(ftrace_hook_name, functions[14]) + || strstr(ftrace_hook_name, functions[18])) { + struct inode *inode = (struct inode *)arg1; + if (inode && inode->i_sb && inode->i_sb->s_type + && !strstr(inode->i_sb->s_type->name, printk_name_first)) + goto next; + if (sysctl_enable_debug + && !strstr(inode->i_sb->s_bdev->bd_disk->disk_name, printk_name_last)) + goto next; +#if !defined TK3 && !defined TK2 + if (inode->kabi_reserved1 != sysctl_trace_type) +#endif + goto next; + } + + if (strstr(ftrace_hook_name, functions[12])) { + pr_info("queue io: users rw %ld(ms) ago ", jiffies_to_msecs(jiffies - arg3)); + } + + if (strstr(ftrace_hook_name, functions[15])) { + struct inode *inode = (struct inode *)arg1; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + if (inode && inode->i_sb && inode->i_sb->s_bdev + && inode->i_sb->s_bdev->bd_disk) { + if (sysctl_enable_debug + && !strstr(inode->i_sb->s_bdev->bd_disk->disk_name, printk_name_last)) + goto next; + stat_block_dec += arg2; + store_info(current->pid, arg2, stat_block_dec, current->comm, + inode->i_sb->s_bdev->bd_disk->disk_name, + "dec:1"); + } + } + + if(strstr(ftrace_hook_name, functions[3])) { + struct ext4_allocation_request *ar = (struct ext4_allocation_request *)arg2; + goto next; + } + + if (strstr(ftrace_hook_name, functions[16])) + pr_info("process:%16s, pid:%8d, current time:%ld(ns)", + current->comm, current->pid, sched_clock()); + + print = true; + if (strstr(ftrace_hook_name, functions[15])) + print = false; + match_dec_index(arg2, print, 1); + } + + if (strstr(ftrace_hook_name, functions[17])) { + struct task_struct *p = (struct task_struct *)arg1; + if (!p) + goto next; + if (!strstr(p->comm, "kworker")) + goto next; + pr_info("current:%16s, %8d, target:%16s, %8d", current->comm, current->pid, + p->comm, p->pid); + } + + io_scene_bfq_before(arg1, arg2, arg3, arg4, arg5, arg6); + +next: + return ret; +} +void io_scene_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long ret) +{ + if (strstr(ftrace_hook_name, functions[19]) && ret) { + int i; + struct list_head *dispatch_queue = (struct list_head *)arg2; + struct inode *inode; + unsigned long time = sched_clock(); +#if !defined TK3 && !defined TK2 + list_for_each_entry(inode, dispatch_queue, i_io_list) + pr_info("move expired:inode num:%ld, delta:%ld, cur:%ld, last:%ld, timeout:%ld, cur:%ld, %ld", + inode->i_ino, time - inode->kabi_reserved2, + time, inode->kabi_reserved2, arg3, jiffies, jiffies > arg3); +#endif + + for (i = 0; i < IO_ARRAY_ITEM; i++) { + if (stat_process[i] == current->pid || stat_process[i] == 0) + break; + } + + if (i >= IO_ARRAY_ITEM) + goto out; + + stat_write[i]++; + if (stat_process[i] == 0) { + stat_process[i] = current->pid; + strncpy(stat_proc_comm[i], current->comm, 16); + } + + if (stat_write[i] != 0 && stat_write[i] % 100 == 0) + pr_info("%s, proc:%20s, %5d, write:%d, %d, %d", + ftrace_hook_name, current->comm, current->pid, + stat_write[i], stat_process[i], i); + } + +out: + return; +} + +unsigned long do_io_scene(unsigned long arg1, unsigned 
long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6, + unsigned long arg7, unsigned long arg8, bool *done, int name) +{ + unsigned long ret = 0; + + *done = false; + if (name != 0) + goto out; + + if (strstr(ftrace_hook_name, functions[20])) { + *done = true; + ret = adjust_ext4_nonda_switch(arg1); + } +out: + return ret; +} + +#define DEFINE_IO_BEFORE_AFTER(name) \ +unsigned long io_scene_before_##name(unsigned long arg1, unsigned long arg2, \ + unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6) \ +{ \ + unsigned long ret; \ +\ + return ret; \ +};\ +void io_scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, \ + unsigned long arg4, unsigned long arg5, unsigned long ret, \ + unsigned long test) \ +{ \ + if (name == 6) \ + io_scene_bfq_after_6(arg1, arg2, arg3, arg4, arg5, ret, test); \ + if (name == 7) \ + io_scene_bfq_after_7(arg1, arg2, arg3, arg4, arg5, ret, test); \ +}; +//DEFINE_IO_BEFORE_AFTER(5) +//DEFINE_IO_BEFORE_AFTER(6) +DEFINE_IO_BEFORE_AFTER(7) +DEFINE_IO_BEFORE_AFTER(8) +DEFINE_IO_BEFORE_AFTER(9) +DEFINE_IO_BEFORE_AFTER(10) +DEFINE_IO_BEFORE_AFTER(11) +DEFINE_IO_BEFORE_AFTER(12) +DEFINE_IO_BEFORE_AFTER(13) +DEFINE_IO_BEFORE_AFTER(14) +DEFINE_IO_BEFORE_AFTER(15) +DEFINE_IO_BEFORE_AFTER(16) +DEFINE_IO_BEFORE_AFTER(17) +DEFINE_IO_BEFORE_AFTER(18) +DEFINE_IO_BEFORE_AFTER(19) +DEFINE_IO_BEFORE_AFTER(20) +DEFINE_IO_BEFORE_AFTER(21) +DEFINE_IO_BEFORE_AFTER(22) +DEFINE_IO_BEFORE_AFTER(23) +DEFINE_IO_BEFORE_AFTER(24) +DEFINE_IO_BEFORE_AFTER(25) +DEFINE_IO_BEFORE_AFTER(26) +DEFINE_IO_BEFORE_AFTER(27) +DEFINE_IO_BEFORE_AFTER(28) +DEFINE_IO_BEFORE_AFTER(29) +DEFINE_IO_BEFORE_AFTER(30) +DEFINE_IO_BEFORE_AFTER(31) +DEFINE_IO_BEFORE_AFTER(32) +DEFINE_IO_BEFORE_AFTER(33) +DEFINE_IO_BEFORE_AFTER(34) +DEFINE_IO_BEFORE_AFTER(35) +DEFINE_IO_BEFORE_AFTER(36) +DEFINE_IO_BEFORE_AFTER(37) +DEFINE_IO_BEFORE_AFTER(38) +DEFINE_IO_BEFORE_AFTER(39) +DEFINE_IO_BEFORE_AFTER(40) +DEFINE_IO_BEFORE_AFTER(41) +DEFINE_IO_BEFORE_AFTER(42) +DEFINE_IO_BEFORE_AFTER(43) +DEFINE_IO_BEFORE_AFTER(44) +DEFINE_IO_BEFORE_AFTER(45) +DEFINE_IO_BEFORE_AFTER(46) +DEFINE_IO_BEFORE_AFTER(47) +DEFINE_IO_BEFORE_AFTER(48) +DEFINE_IO_BEFORE_AFTER(49) +DEFINE_IO_BEFORE_AFTER(50) +void init_io_scene(void) +{ + io_scene_bfq_init(); +} + +void exit_io_scene(void) +{ +} diff --git a/ops/os_stat/os_stat/kprobe_prehook.c b/ops/os_stat/os_stat/kprobe_prehook.c new file mode 100644 index 0000000000000000000000000000000000000000..4844954103bad6a8cc8143d8cb8bef89d533a70a --- /dev/null +++ b/ops/os_stat/os_stat/kprobe_prehook.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * NOTE: This example is works on x86 and powerpc. + * Here's a sample kernel module showing the use of kprobes to dump a + * stack trace and selected registers when _do_fork() is called. + * + * For more information on theory of operation of kprobes, see + * Documentation/kprobes.txt + * + * You will see the trace data in /var/log/messages and on the console + * whenever _do_fork() is invoked to create a new process. 
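+ *
+ * In this module the probed symbol defaults to "syscall_return_slowpath"
+ * rather than _do_fork; sysctl_kprobe_enable_handler() below accepts a
+ * symbol name through proc_dostring() and plants a second kprobe (kp_new)
+ * on it at run time, and sysctl_kprobe_disable_handler() removes it again.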
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include "kprobe_prehook.h"
+#include "data_aware.h"
+
+static char symbol[NAME_MAX] = "syscall_return_slowpath";
+char symbol_new[NAME_MAX] = "syscall_return_slowpath";
+unsigned int sysctl_kprobe_unregister;
+
+/* For each probe you need to allocate a kprobe structure */
+static struct kprobe kp = {
+	.symbol_name = symbol,
+};
+
+static struct kprobe kp_new = {
+	.symbol_name = symbol_new,
+};
+/*
+void stat_do_syscall_64(unsigned long nr, struct pt_regs *regs)
+{
+	pr_info("sys:%ld, %ld", nr, regs->orig_ax);
+	do_syscall_64(nr, regs);
+}
+*/
+
+/* kprobe pre_handler: called just before the probed instruction is executed */
+static int handler_pre(struct kprobe *p, struct pt_regs *regs)
+{
+	//stat_stat_syscall_enter();
+	/* A dump_stack() here will give a stack backtrace */
+	return 0;
+}
+
+/* kprobe post_handler: called after the probed instruction is executed */
+static void handler_post(struct kprobe *p, struct pt_regs *regs,
+	unsigned long flags)
+{
+}
+
+/*
+ * fault_handler: this is called if an exception is generated for any
+ * instruction within the pre- or post-handler, or when Kprobes
+ * single-steps the probed instruction.
+ */
+static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
+{
+	pr_info("fault_handler: p->addr = 0x%p, trap #%d\n", p->addr, trapnr);
+	/* Return 0 because we don't handle the fault. */
+	return 0;
+}
+
+static int register_new_func(char *name)
+{
+	int ret;
+
+	strncpy(symbol_new, name, NAME_MAX - 1);
+	kp_new.pre_handler = handler_pre;
+	kp_new.post_handler = handler_post;
+	ret = register_kprobe(&kp_new);
+	if (ret < 0) {
+		pr_err("register_kprobe failed, returned %d, %s\n", ret, name);
+		return ret;
+	}
+	return 0;
+}
+
+static void unregister_new_func(void)
+{
+	unregister_kprobe(&kp_new);
+}
+
+int sysctl_kprobe_enable_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (sysctl_module_disable)
+		return -EPERM;
+
+	ret = proc_dostring(table, write, buffer, lenp, ppos);
+	register_new_func(symbol_new);
+
+	return ret;
+}
+
+int sysctl_kprobe_disable_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (sysctl_module_disable)
+		return -EPERM;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	unregister_new_func();
+
+	return ret;
+}
+int __kprobe_init(void)
+{
+	int ret;
+	kp.pre_handler = handler_pre;
+	kp.post_handler = handler_post;
+#ifndef TK5
+	kp.fault_handler = handler_fault;
+#endif
+
+	ret = register_kprobe(&kp);
+	if (ret < 0) {
+		pr_err("register_kprobe failed, returned %d\n", ret);
+		return ret;
+	}
+	pr_info("Planted kprobe at %p\n", kp.addr);
+	return 0;
+}
+int kprobe_init(void)
+{
+	//return __kprobe_init();
+	return 0;
+}
+
+void __kprobe_exit(void)
+{
+	unregister_kprobe(&kp);
+	pr_info("kprobe at %p unregistered\n", kp.addr);
+
+	return;
+}
+
+void kprobe_exit(void)
+{
+	//__kprobe_exit();
+	return;
+}
+
diff --git a/ops/os_stat/os_stat/kprobe_prehook.h b/ops/os_stat/os_stat/kprobe_prehook.h
new file mode 100644
index 0000000000000000000000000000000000000000..63b4e1981de2d6c96e677ca6722b22dfb6aaa1ad
--- /dev/null
+++ b/ops/os_stat/os_stat/kprobe_prehook.h
@@ -0,0 +1,101 @@
+/*
+ * Kpatch module
+ *
+ * The core code comes from tpatch, the early hot patch tool of tlinux.
+ * + */ +#ifndef _KPROBE_PREHOOK_H +#define _KPROBE_PREHOOK_H + +#include "hook_tk5.h" + +#define MAX_SYMBOL_LEN 64 +#ifdef TK5 +#define FUNCTION_PRINT_FORMAT "%pS" +#else +#define FUNCTION_PRINT_FORMAT "%pF" +#endif +#if defined(TK3) || defined(TK2) +#define NAME_MAX 255 +#endif + +#define HOOK_FUNC_NUM 50 + +enum ftrace_status { + FTRACE_INIT = 0, + FTRACE_REGISTER = 2, + FTRACE_UNREGISTERING = 3, + FTRACE_UNREGISTE_STARTED = 4, + FTRACE_UNREGISTED = 5, + FTRACE_REGISTER_FAILED = 6 +}; +enum ftrace_control_status { + FTRACE_CONTROL_INIT = 0, + FTRACE_CONTROL_CLEAR = 1, + FTRACE_CONTROL_POINTER = 2, + FTRACE_CONTROL_DELTA = 3, + FTRACE_CONTROL_PRINT = 6 +}; +/* per-instance private data */ +struct kret_data { + unsigned long nr; + unsigned long num; + unsigned long time[8]; +}; + +typedef unsigned long (*func_type)(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6, + unsigned long arg7, unsigned long arg8); +typedef unsigned long (**func_body_type)(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6, + unsigned long arg7, unsigned long arg8); + +extern struct work_struct ftrace_work; +extern struct work_struct ftrace_work_init; +extern int register_kretprobe_ftrace; +extern int register_ftrace_ftrace; +extern char ftrace_hook_name[NAME_MAX]; +extern char func_name_new[NAME_MAX]; +extern unsigned int sysctl_kprobe_unregister; +extern char symbol_new[NAME_MAX]; +extern char symbol_kret_new[NAME_MAX]; +extern char show_parameter_val[NAME_MAX]; +extern char show_parameter_type[NAME_MAX]; +extern unsigned long stat_func_total_time[HOOK_FUNC_NUM]; +extern unsigned long stat_func_block_time[HOOK_FUNC_NUM]; +extern unsigned long stat_func_total_num[HOOK_FUNC_NUM]; +extern unsigned long stat_one_func_size; +extern unsigned long stat_hook_function; + +extern int kprobe_init(void); +extern void kprobe_exit(void); +extern int kretprobe_init(void); +extern void kretprobe_exit(void); +extern bool print_func_name(void *opt, void *opt2, void *opt3, char *pointer, char *func); +extern void print_info(void *opt, void *opt2, void *opt3); +extern char func_pointer[NAME_MAX]; +extern char func_pointer_name[NAME_MAX]; +extern char printk_name_first[NAME_MAX]; +extern char printk_name_last[NAME_MAX]; +extern void ftrace_unhook_work_fn(struct work_struct *work); +extern void ftrace_hook_work_fn(struct work_struct *work); +extern bool sub_print_func_name(void *opt, void *opt2, void *opt3, char *pointer, char *func); +extern bool check_func_name(char *func); +extern unsigned long *stat_one_func_time; +extern void save_sched_in(void); +extern void save_sched_out(void); +extern void stat_total_time(void); +extern char *get_one_func_name(int i); +extern void save_start_time(int index); +extern void save_total_time(int index, bool this); +extern int register_kret_new_func_batch(int index, char* func_name); +extern void unregister_kret_new_func_batch(int index); +extern int get_one_func_count(void); +extern func_type get_one_func(char *name); +extern func_body_type get_one_func_body(char *name); +extern unsigned long **get_one_var(char *name); +extern void catch_kill_signal(unsigned long arg1, unsigned long arg2, unsigned long arg3); + +extern int ftrace_patch_init(char *name); +extern void ftrace_patch_exit(void); +#endif diff --git a/ops/os_stat/os_stat/kretprobe_prehook.c b/ops/os_stat/os_stat/kretprobe_prehook.c new file mode 100644 index 
0000000000000000000000000000000000000000..09d257bcae9a6da470eec58996d7e2e7b07270d3 --- /dev/null +++ b/ops/os_stat/os_stat/kretprobe_prehook.c @@ -0,0 +1,631 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kretprobe_example.c + * + * Here's a sample kernel module showing the use of return probes to + * report the return value and total time taken for probed function + * to run. + * + * usage: insmod kretprobe_example.ko func= + * + * If no func_name is specified, _do_fork is instrumented + * + * For more information on theory of operation of kretprobes, see + * Documentation/kprobes.txt + * + * Build and insert the kernel module as done in the kprobe example. + * You will see the trace data in /var/log/messages and on the console + * whenever the probed function returns. (Some messages may be suppressed + * if syslogd is configured to eliminate duplicate messages.) + */ + +#include +#include +#include +#include +#include +#include +#include "kprobe_prehook.h" +#include "data_aware.h" +#include "hook.h" + +#ifdef TK5 +static char func_name[NAME_MAX] = "x64_sys_call"; +#else +static char func_name[NAME_MAX] = "do_syscall_64"; +#endif +char symbol_kret_new[NAME_MAX]; +char func_name_new[NAME_MAX]; +DEFINE_MUTEX(kret_hook_func); +int register_kretprobe_ftrace = 0; +int register_ftrace_ftrace = FTRACE_INIT; +int register_syscall_ftrace = 0; +int more_register_kretprobe_ftrace[30] = {0}; + +#ifdef CONFIG_X86_64 +/******stat_do_syscall_64: only for test********/ +void stat_do_syscall_64(unsigned long nr, struct pt_regs *regs) +{ + do_syscall_64(nr, regs); +} +#endif + +/* Here we use the entry_hanlder to timestamp function entry */ +static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct kret_data *data; + + if (!current->mm) + return 1; /* Skip kernel threads */ +#ifndef TK2 + current->numa_faults_locality[0] = 0; + current->numa_faults_locality[1] = 0; + current->numa_faults_locality[2] = 0; +#endif + data = (struct kret_data *)ri->data; + +#ifdef CONFIG_X86_64 + set_sys_nr(regs->ax); + data->nr = regs->ax; +#elif defined CONFIG_ARM64 + set_sys_nr(regs->syscallno); + data->nr = regs->syscallno; +#endif + stat_stat_syscall_enter(data); +#if 0 + pr_info("kret sys:%ld", regs->ax); + data = (struct kret_data *)ri->data; + data->entry_stamp = ktime_get(); +#endif + return 0; +} + +static int new_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct kret_data *data = (struct kret_data *)ri->data; + + stat_func_enter(data); + return 0; +} + +/* + * Return-probe handler: Log the return value and duration. Duration may turn + * out to be zero consistently, depending upon the granularity of time + * accounting on the platform. 
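+ * In this module the return handler does not print; it forwards the
+ * per-instance data captured by entry_handler() to stat_stat_syscall_exit()
+ * so the syscall statistics are aggregated instead (the printing variant is
+ * kept under #if 0 for reference).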
+ */ +static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct kret_data *data = (struct kret_data *)ri->data; + + stat_stat_syscall_exit(get_sys_nr(), data); +#ifndef TK2 + current->numa_faults_locality[0] = 0; + current->numa_faults_locality[1] = 0; + current->numa_faults_locality[2] = 0; +#endif +#if 0 + unsigned long retval = regs_return_value(regs); + struct kret_data *data = (struct kret_data *)ri->data; + s64 delta; + ktime_t now; + + now = ktime_get(); + delta = ktime_to_ns(ktime_sub(now, data->entry_stamp)); + pr_info("%s returned %lu and took %lld ns to execute:%ld\n", + func_name, retval, (long long)delta, regs->ax); +#endif + return 0; +} + +static int new_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct kret_data *data = (struct kret_data *)ri->data; + + stat_func_exit(data); + return 0; +} +static struct kretprobe resident_kretprobe = { + .handler = ret_handler, + .entry_handler = entry_handler, + .data_size = sizeof(struct kret_data), + /* Probe up to 20 instances concurrently. */ + .maxactive = 20, +}; +static struct kretprobe new_kretprobe = { + .handler = new_ret_handler, + .entry_handler = new_entry_handler, + .data_size = sizeof(struct kret_data), + /* Probe up to 20 instances concurrently. */ + .maxactive = 200, +}; +/* for one function hook */ +static int register_kret_new_func(char *name) +{ + int ret; + + strcpy(func_name_new, name); + new_kretprobe.kp.symbol_name = func_name_new; + new_kretprobe.handler = new_ret_handler; + new_kretprobe.entry_handler = new_entry_handler; + new_kretprobe.data_size = sizeof(struct kret_data); + new_kretprobe.maxactive = 200; + ret = register_kretprobe(&new_kretprobe); + if (ret < 0) { + pr_err("register_kretprobe failed, returned %d, %s, %s\n", ret, name, func_name_new); + return ret; + } + pr_info("Planted return probe at %s: %p\n", + new_kretprobe.kp.symbol_name, new_kretprobe.kp.addr); + return 0; +} + +static void ungister__kret_new_func(void) +{ + pr_info("%s, unregister_kretprobe %s\n", __func__, func_name_new); + unregister_kretprobe(&new_kretprobe); + memset(&new_kretprobe, 0, sizeof(new_kretprobe)); +} +/* for hot path batch process */ +#define KRET_FUNC(index) \ +char func_name_new_##index[NAME_MAX]; \ +static int new_entry_handler_##index(struct kretprobe_instance *ri, struct pt_regs *regs) \ +{ \ +\ + save_start_time(index);\ + return 0;\ +}\ +static int new_ret_handler_##index(struct kretprobe_instance *ri, struct pt_regs *regs)\ +{\ + /* stat latency between test_func1 and test_func */ \ + if (sysctl_trace_type == 1) \ + save_total_time(1, false); \ + else \ + save_total_time(index, true);\ + return 0;\ +}\ +static struct kretprobe new_kretprobe_##index = {\ + .handler = new_ret_handler_##index,\ + .entry_handler = new_entry_handler_##index,\ + .data_size = sizeof(struct kret_data),\ + .maxactive = 200,\ +};\ +static int register_kret_new_func_##index(char *func_name)\ +{\ + int ret;\ +\ + mutex_lock(&kret_hook_func); \ + more_register_kretprobe_ftrace[index - 1] = 1; \ + strcpy(func_name_new_##index, func_name);\ + new_kretprobe_##index.kp.symbol_name = func_name_new_##index;\ + new_kretprobe_##index.handler = new_ret_handler_##index;\ + new_kretprobe_##index.entry_handler = new_entry_handler_##index;\ + new_kretprobe_##index.data_size = sizeof(struct kret_data);\ + new_kretprobe_##index.maxactive = 200;\ + ret = register_kretprobe(&new_kretprobe_##index);\ + if (ret < 0) {\ + pr_err("register_kretprobe failed, returned %d, %s, %s\n", ret, func_name, 
func_name_new_##index);\ + mutex_unlock(&kret_hook_func); \ + return ret;\ + }\ + pr_info("%s Planted return probe at %s: %p, %d\n", __func__, new_kretprobe_##index.kp.symbol_name, new_kretprobe_##index.kp.addr, index); \ + mutex_unlock(&kret_hook_func); \ + return 0;\ +}\ +static void unregister_kret_new_func_##index(void)\ +{\ + mutex_lock(&kret_hook_func); \ + if (more_register_kretprobe_ftrace[index - 1] == 0) \ + goto out; \ + more_register_kretprobe_ftrace[index - 1] = 0; \ + pr_info("%s unregister_kretprobe %d, %s, %d\n", __func__, index, new_kretprobe_##index.kp.symbol_name, more_register_kretprobe_ftrace[index]);\ + unregister_kretprobe(&new_kretprobe_##index);\ + memset(&new_kretprobe, 0, sizeof(new_kretprobe));\ +out: \ + mutex_unlock(&kret_hook_func); \ +} +KRET_FUNC(1) +KRET_FUNC(2) +KRET_FUNC(3) +KRET_FUNC(4) +KRET_FUNC(5) +KRET_FUNC(6) +KRET_FUNC(7) +KRET_FUNC(8) +KRET_FUNC(9) +KRET_FUNC(10) +KRET_FUNC(11) +KRET_FUNC(12) +KRET_FUNC(13) +KRET_FUNC(14) +KRET_FUNC(15) +KRET_FUNC(16) +KRET_FUNC(17) +KRET_FUNC(18) +KRET_FUNC(19) +KRET_FUNC(20) +KRET_FUNC(21) +KRET_FUNC(22) +KRET_FUNC(23) +KRET_FUNC(24) +KRET_FUNC(25) +KRET_FUNC(26) +KRET_FUNC(27) +KRET_FUNC(28) +KRET_FUNC(29) +KRET_FUNC(30) + +int register_kret_new_func_batch(int index, char* func_name) +{ + int ret = -1; + + switch(index) { + case 1: + ret = register_kret_new_func_1(func_name); + break; + case 2: + ret = register_kret_new_func_2(func_name); + break; + case 3: + ret = register_kret_new_func_3(func_name); + break; + case 4: + ret = register_kret_new_func_4(func_name); + break; + case 5: + ret = register_kret_new_func_5(func_name); + break; + case 6: + ret = register_kret_new_func_6(func_name); + break; + case 7: + ret = register_kret_new_func_7(func_name); + break; + case 8: + ret = register_kret_new_func_8(func_name); + break; + case 9: + ret = register_kret_new_func_9(func_name); + break; + case 10: + ret = register_kret_new_func_10(func_name); + break; + case 11: + ret = register_kret_new_func_11(func_name); + break; + case 12: + ret = register_kret_new_func_12(func_name); + break; + case 13: + ret = register_kret_new_func_13(func_name); + break; + case 14: + ret = register_kret_new_func_14(func_name); + break; + case 15: + ret = register_kret_new_func_15(func_name); + break; + case 16: + ret = register_kret_new_func_16(func_name); + break; + case 17: + ret = register_kret_new_func_17(func_name); + break; + case 18: + ret = register_kret_new_func_18(func_name); + break; + case 19: + ret = register_kret_new_func_19(func_name); + break; + case 20: + ret = register_kret_new_func_20(func_name); + break; + case 21: + ret = register_kret_new_func_21(func_name); + break; + case 22: + ret = register_kret_new_func_22(func_name); + break; + case 23: + ret = register_kret_new_func_23(func_name); + break; + case 24: + ret = register_kret_new_func_24(func_name); + break; + case 25: + ret = register_kret_new_func_25(func_name); + break; + case 26: + ret = register_kret_new_func_26(func_name); + break; + case 27: + ret = register_kret_new_func_27(func_name); + break; + case 28: + ret = register_kret_new_func_28(func_name); + break; + case 29: + ret = register_kret_new_func_29(func_name); + break; + case 30: + ret = register_kret_new_func_30(func_name); + break; + default: + break; + } + + return ret; +} +void unregister_kret_new_func_batch(int index) +{ + switch(index) { + case 1: + unregister_kret_new_func_1(); + break; + case 2: + unregister_kret_new_func_2(); + break; + case 3: + unregister_kret_new_func_3(); + break; + 
case 4: + unregister_kret_new_func_4(); + break; + case 5: + unregister_kret_new_func_5(); + break; + case 6: + unregister_kret_new_func_6(); + break; + case 7: + unregister_kret_new_func_7(); + break; + case 8: + unregister_kret_new_func_8(); + break; + case 9: + unregister_kret_new_func_9(); + break; + case 10: + unregister_kret_new_func_10(); + break; + case 11: + unregister_kret_new_func_11(); + break; + case 12: + unregister_kret_new_func_12(); + break; + case 13: + unregister_kret_new_func_13(); + break; + case 14: + unregister_kret_new_func_14(); + break; + case 15: + unregister_kret_new_func_15(); + break; + case 16: + unregister_kret_new_func_16(); + break; + case 17: + unregister_kret_new_func_17(); + break; + case 18: + unregister_kret_new_func_18(); + break; + case 19: + unregister_kret_new_func_19(); + break; + case 20: + unregister_kret_new_func_20(); + break; + case 21: + unregister_kret_new_func_21(); + break; + case 22: + unregister_kret_new_func_22(); + break; + case 23: + unregister_kret_new_func_23(); + break; + case 24: + unregister_kret_new_func_24(); + break; + case 25: + unregister_kret_new_func_25(); + break; + case 26: + unregister_kret_new_func_26(); + break; + case 27: + unregister_kret_new_func_27(); + break; + case 28: + unregister_kret_new_func_28(); + break; + case 29: + unregister_kret_new_func_29(); + break; + case 30: + unregister_kret_new_func_30(); + break; + default: + break; + } +} + +int sysctl_kretprobe_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + ret = proc_dostring(table, write, buffer, lenp, ppos); + if (!write) + return ret; + + if (register_kretprobe_ftrace == 1) { + pr_err("need unregister kretprobe function:maybe:%s", symbol_new); + return -EPERM; + } + mutex_lock(&kret_hook_func); + + ret = register_kret_new_func(symbol_new); + register_kretprobe_ftrace = 1; + if (ret < 0) { + register_kretprobe_ftrace = 0; + strcpy(ftrace_hook_name, symbol_new); + if (register_ftrace_ftrace == FTRACE_REGISTER_FAILED) + register_ftrace_ftrace = FTRACE_INIT; + schedule_work(&ftrace_work_init); + } + + mutex_unlock(&kret_hook_func); + return 0; +} + +int sysctl_kretprobe_disable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + ret = proc_dostring(table, write, buffer, lenp, ppos); + + mutex_lock(&kret_hook_func); + + if (register_kretprobe_ftrace == 1) + ungister__kret_new_func(); + + schedule_work_on(0, &ftrace_work); + + register_kretprobe_ftrace = 0; + + mutex_unlock(&kret_hook_func); + return ret; +} + +int syscall_hook_init(void) +{ + int ret; + + mutex_lock(&kret_hook_func); + + if (register_syscall_ftrace) { + pr_info("syscall hooked already"); + mutex_unlock(&kret_hook_func); + return 0; + } + resident_kretprobe.kp.symbol_name = func_name; + ret = register_kretprobe(&resident_kretprobe); + if (ret < 0) { + pr_err("register_kretprobe failed, returned %d\n", ret); + mutex_unlock(&kret_hook_func); + return ret; + } + register_syscall_ftrace = 1; + pr_info("Planted return probe at %s: %p\n", + resident_kretprobe.kp.symbol_name, resident_kretprobe.kp.addr); + mutex_unlock(&kret_hook_func); + return 0; +} + +void syscall_hook_exit(void) +{ + mutex_lock(&kret_hook_func); + + if (register_syscall_ftrace == 0) 
+ goto out; + + register_syscall_ftrace = 0; + + pr_info("%s, unregister_kretprobe\n", __func__); + unregister_kretprobe(&resident_kretprobe); + pr_info("kretprobe at %p unregistered\n", resident_kretprobe.kp.addr); + /* nmissed > 0 suggests that maxactive was set too low. */ + pr_info("Missed probing %d instances of %s\n", + resident_kretprobe.nmissed, resident_kretprobe.kp.symbol_name); +out: + mutex_unlock(&kret_hook_func); +} + +int sysctl_system_hook_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!write) + return ret; + + syscall_hook_init(); + system_base_function_hook(); + + return ret; +} +int sysctl_system_unhook_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sysctl_module_disable) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!write) + return ret; + + syscall_hook_exit(); + system_base_function_unhook(); + return ret; +} + +int kretprobe_init(void) +{ + return 0; +} +void kretprobe_exit(void) +{ + int i; + + mutex_lock(&kret_hook_func); + if (register_kretprobe_ftrace == 1) + ungister__kret_new_func(); + register_kretprobe_ftrace = 0; + mutex_unlock(&kret_hook_func); + + for(i = 1; i <= 30; i++) + unregister_kret_new_func_batch(i); + + + while (register_ftrace_ftrace == FTRACE_UNREGISTERING) + schedule_timeout(1000); + + ftrace_unhook_work_fn(&ftrace_work); + + if (register_syscall_ftrace == 1) + syscall_hook_exit(); + + register_kretprobe_ftrace = 0; + register_syscall_ftrace = 0; +} + diff --git a/ops/os_stat/os_stat/main.c b/ops/os_stat/os_stat/main.c new file mode 100644 index 0000000000000000000000000000000000000000..fc1b40d9f52c0a5dfc6616ba2f82aea08c6ee724 --- /dev/null +++ b/ops/os_stat/os_stat/main.c @@ -0,0 +1,74 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "syms.h" +#include "hook.h" +#include "data_aware.h" +#include "sysctl.h" +#include "kprobe_prehook.h" +#include "hook_tk5.h" + +#define DRV_VERSION "1.0.0" + +static int __init main_init(void) +{ + int ret; + +#ifdef TK5 + ftrace_init_tk5(); +#endif + + ret = ksyms_init(); + if (ret < 0) + return ret; + + ret = patch_init(); + if (ret < 0) + goto err_patch; + + ret = kprobe_init(); + if (ret < 0) + goto err_kprobe; + ret = kretprobe_init(); + if (ret < 0) + goto err_kretprobe; + ret = sysctl_table_init(); + if (ret < 0) + goto err_sysctl_table; + + ret = data_init(); + if(ret < 0) + goto err_data_init; + + return 0; + +err_data_init: + sysctl_table_exit(); +err_sysctl_table: + kretprobe_exit(); +err_kretprobe: + kprobe_exit(); +err_kprobe: + patch_exit(); +err_patch: + return ret; +} + +static void __exit main_exit(void) +{ + unload_disable_module(); + sysctl_table_exit(); + kretprobe_exit(); + kprobe_exit(); + patch_exit(); + data_exit(); + pr_info("module exit"); + +} + +module_init(main_init); +module_exit(main_exit); +MODULE_DESCRIPTION("hook for test module for tlinux" UTS_RELEASE); +MODULE_AUTHOR("aurelianliu@tencent.com"); +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE("GPL"); diff --git a/ops/os_stat/os_stat/mm_scene/memory_scene.c b/ops/os_stat/os_stat/mm_scene/memory_scene.c new file mode 100644 index 
0000000000000000000000000000000000000000..a89afcac03b2fb34e0f6dd7d385aeed91d08e396 --- /dev/null +++ b/ops/os_stat/os_stat/mm_scene/memory_scene.c @@ -0,0 +1,86 @@ +/* + * find kernel problem + * aurelianliu@tencent.com + */ +#include "../scene_layer.h" +#include "../syms.h" +#include "../data_aware.h" +static unsigned long stat_time; +static unsigned long stat_data[MAX_ORDER]; +static unsigned long stat_data_process[MAX_ORDER][100]; +static unsigned long stat_data_time[MAX_ORDER][100]; + +static char functions[][32] = { + "__alloc_pages_nodemask", + "free_one_page" +}; +static unsigned long default_time = 100000; + +void mm_scene_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + unsigned long ret = 0; + + if ((sysctl_enable_debug == SCENE_ADJUST_MEM_STAT) && strstr(ftrace_hook_name, functions[0])) { + unsigned int order = arg2; + int i; + if(arg2 < MAX_ORDER && arg2 >= 0) { + stat_data[arg2]++; + for (i = 0; i < 100; i++) { + if (stat_data_process[arg2][i] == current->pid) + break; + if (stat_data_process[arg2][i] == 0) { + stat_data_process[arg2][i] = current->pid; + break; + } + } + if (i < 100) { + stat_data_time[arg2][i]++; + } + + } + if (sysctl_enable_debug == SCENE_ADJUST_MEM_STAT && jiffies - stat_time > default_time) { + int j; + for (j = 0; j < MAX_ORDER; j++) { + if (stat_data[j] != 0) + pr_info("order:%d, total num:%d", j, stat_data[j]); + for (i = 0; i < 100; i++) { + if (stat_data_time[j][i] != 0) + pr_info("order:%d, pid:%d, num:%d", + j, stat_data_process[j][i], stat_data_time[j][i]); + } + } + stat_time = jiffies; + } + } + + return; +} +bool mm_scene_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + bool ret = false; + + if ((sysctl_enable_debug == SCENE_ADJUST_MEM_STAT) && (strstr(ftrace_hook_name, functions[1]))) { + unsigned int order = arg4; + int i; + if(order < MAX_ORDER && order >= 0) { + stat_data[order]--; + for (i = 0; i < 100; i++) { + if (stat_data_process[order][i] == current->pid) + break; + if (stat_data_process[order][i] == 0) { + stat_data_process[order][i] = current->pid; + break; + } + } + if (i < 100) { + stat_data_time[order][i]--; + } + + } + ret = true; + } + + return ret; +} diff --git a/ops/os_stat/os_stat/net_scene/net_scene.c b/ops/os_stat/os_stat/net_scene/net_scene.c new file mode 100644 index 0000000000000000000000000000000000000000..cc183fd3d073453620cfbe5cd02ff02f6c5bdc1b --- /dev/null +++ b/ops/os_stat/os_stat/net_scene/net_scene.c @@ -0,0 +1,392 @@ +/* + * find kernel problem + * aurelianliu@tencent.com + */ +#include "net_scene.h" + +static struct kmem_cache *net_cachep_adjust; +static char functions[][32] = { + "setup_net", + "deferred_put_nlk_sk", + "sk_alloc", + "__sk_destruct", + "sk_clone_lock", + "__put_net", + "net_drop_ns" +}; + +#if !defined TK3 && !defined TK2 +static void net_free(struct net *net) +{ + kfree(rcu_access_pointer(net->gen)); + /* There should not be any trackers left there. 
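+ * net_free() below returns the object either to the module's enlarged
+ * "net_namespace_adjust" cache (when the allocation carries the tracer
+ * footer, i.e. net_ptr == net) or to the kernel's original net namespace
+ * cache, reached through the resolved symbol p_test_var_2.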
*/ + if (net_cachep_adjust && ((struct net_tracer *)net)->net_ptr == net) { + ref_tracker_dir_exit(&((struct net_tracer *)net)->notrefcnt_tracker); + kmem_cache_free((struct kmem_cache *)net_cachep_adjust, net); + } + else + kmem_cache_free((struct kmem_cache *)p_test_var_2, net); +} + +void adjust_net_drop_ns(void *p) +{ + struct net *ns = p; + if (ns && refcount_dec_and_test(&ns->passive)) + net_free(ns); +} + +/* setup_net */ +unsigned long net_scene_before_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + if (strstr(get_one_func_name(1), functions[0])) { + struct net *net = (struct net *)arg1; + if (!net) + goto out; + if (net_cachep_adjust && ((struct net_tracer *)net)->net_ptr == net) { +#ifdef TK5 + ref_tracker_dir_init(&((struct net_tracer *)net)->refcnt_tracker, 128, "net refcnt"); + ref_tracker_dir_init(&((struct net_tracer *)net)->notrefcnt_tracker, 128, "net notrefcnt"); +#else + ref_tracker_dir_init(&((struct net_tracer *)net)->refcnt_tracker, 128); + ref_tracker_dir_init(&((struct net_tracer *)net)->notrefcnt_tracker, 128); +#endif + } + } +out: + return 0; +} + +void net_scene_after_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test) +{ +} + +/* deferred_put_nlk_sk */ +unsigned long net_scene_before_7(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + if (strstr(get_one_func_name(7), functions[1])) { + struct rcu_head *head = (struct rcu_head *)arg1; + struct netlink_sock *nlk = NULL; + struct sock *sk = NULL; + + if (head) + nlk = container_of(head, struct netlink_sock, rcu); + if (nlk) + sk = &nlk->sk; + /* Because struct net might disappear soon, do not keep a pointer. */ + if (sk && !sk->sk_net_refcnt && sock_net(sk) != &init_net) { + __netns_tracker_free(sock_net(sk), (netns_tracker *)&sk->kabi_reserved1, false); + /* Because of deferred_put_nlk_sk and use of work queue, + * it is possible netns will be freed before this socket. 
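+		 * To cope with that, the code below re-points the socket at
+		 * init_net and takes a (non-refcounted) tracker on init_net in
+		 * place of the one just released.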
+ */
+			sock_net_set(sk, &init_net);
+			__netns_tracker_alloc(&init_net, (netns_tracker *)&sk->kabi_reserved1,
+					      false, GFP_KERNEL);
+		}
+	}
+	return 0;
+}
+
+void net_scene_after_7(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test)
+{
+}
+
+/* sk_alloc */
+unsigned long net_scene_before_8(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6)
+{
+	return 0;
+}
+
+void net_scene_after_8(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test)
+{
+	if (strstr(get_one_func_name(8), functions[2])) {
+		struct sock *sk = (struct sock *)ret;
+		struct net *net = (struct net *)arg1;
+		if (!net || !sk)
+			goto out;
+		if (likely(sk->sk_net_refcnt))
+			get_net_track(net, (netns_tracker *)&sk->kabi_reserved1, arg2);
+		else
+			__netns_tracker_alloc(net, (netns_tracker *)&sk->kabi_reserved1, false, arg2);
+	}
+out:
+	return;
+}
+
+/* __sk_destruct */
+unsigned long net_scene_before_9(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6)
+{
+	if (strstr(get_one_func_name(9), functions[3])) {
+		struct rcu_head *head = (struct rcu_head *)arg1;
+		struct netlink_sock *nlk = NULL;
+		struct sock *sk = NULL;
+
+		if (head)
+			nlk = container_of(head, struct netlink_sock, rcu);
+		if (nlk)
+			sk = &nlk->sk;
+		if (!sk)
+			goto out;
+
+		if (likely(sk->sk_net_refcnt))
+			put_net_track(sock_net(sk), (netns_tracker *)&sk->kabi_reserved1);
+		else
+			__netns_tracker_free(sock_net(sk), (netns_tracker *)&sk->kabi_reserved1, false);
+	}
+out:
+	return 0;
+}
+
+void net_scene_after_9(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test)
+{
+}
+
+/* sk_clone_lock */
+unsigned long net_scene_before_10(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long arg6)
+{
+	return 0;
+}
+
+void net_scene_after_10(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4,
+		unsigned long arg5, unsigned long ret, unsigned long test)
+{
+	if (strstr(get_one_func_name(10), functions[4])) {
+		struct sock *sk = (struct sock *)ret;
+		if (likely(sk->sk_net_refcnt))
+			get_net_track(sock_net(sk), (netns_tracker *)&sk->kabi_reserved1, arg2);
+		else
+			/* Kernel sockets are not elevating the struct net refcount.
+			 * Instead, use a tracker to more easily detect if a layer
+			 * is not properly dismantling its kernel sockets at netns
+			 * destroy time.
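+			 * The tracker itself is stored in the sock's kabi_reserved1
+			 * slot, so no new struct sock member is needed for it.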
+ */ + __netns_tracker_alloc(sock_net(sk), (netns_tracker *)&sk->kabi_reserved1, false, arg2); + } +} + +/* __put_net */ +unsigned long net_scene_before_11(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + if (strstr(get_one_func_name(11), functions[5])) { + struct net *net = (struct net *)arg1; + if (net && net_cachep_adjust && ((struct net_tracer *)net)->net_ptr == net) + ref_tracker_dir_exit(&((struct net_tracer *)net)->refcnt_tracker); + } + return 0; +} + +void net_scene_after_11(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test) +{ +} + +/* + * inc_ucount + * dec_ucount + */ +static struct ucounts *inc_net_namespaces(struct user_namespace *ns) +{ + unsigned long ret; + + ret = test_func3(ns, 0, 0, 0, 0, 0, 0, 0); + + return (struct ucounts *)ret; +} + +static void dec_net_namespaces(struct ucounts *ucounts) +{ + test_func4(ucounts, UCOUNT_NET_NAMESPACES, 0, 0, 0, 0, 0, 0); + return; +} + +/* + * net_alloc_generic + * net_cachep + */ +static struct net *net_alloc(void) +{ + struct net *net = NULL; + struct net_generic *ng; + + ng = test_func2(0, 0, 0, 0, 0, 0, 0, 0); + if (!ng) + goto out; + + if (!net_cachep_adjust) + net_cachep_adjust = kmem_cache_create("net_namespace_adjust", sizeof(struct net_tracer), + SMP_CACHE_BYTES, + SLAB_PANIC|SLAB_ACCOUNT, NULL); + if (net_cachep_adjust) + net = kmem_cache_zalloc(net_cachep_adjust, GFP_KERNEL); + else + net = kmem_cache_zalloc((struct kmem_cache *)p_test_var_2, GFP_KERNEL); + if (!net) + goto out_free; + if (net_cachep_adjust) + ((struct net_tracer *)net)->net_ptr = net; + +#ifdef CONFIG_KEYS + net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL); + if (!net->key_domain) + goto out_free_2; + refcount_set(&net->key_domain->usage, 1); +#endif + + rcu_assign_pointer(net->gen, ng); +out: + return net; + +#ifdef CONFIG_KEYS +out_free_2: + if (net_cachep_adjust && ((struct net_tracer *)net)->net_ptr == net) + kmem_cache_free((struct kmem_cache *)net_cachep_adjust, net); + else + kmem_cache_free((struct kmem_cache *)p_test_var_2, net); + net = NULL; +#endif +out_free: + kfree(ng); + goto out; +} + +/* + * setup_net + * net_drop_ns + * key_remove_domain + * pernet_ops_rwsem + */ +struct net *adjust_copy_net_ns(unsigned long flags, + struct user_namespace *user_ns, struct net *old_net) +{ + struct ucounts *ucounts; + struct net *net; + int rv; + + if (!(flags & CLONE_NEWNET)) + return get_net(old_net); + + ucounts = inc_net_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + net = net_alloc(); + if (!net) { + rv = -ENOMEM; + goto dec_ucounts; + } + refcount_set(&net->passive, 1); + net->ucounts = ucounts; + get_user_ns(user_ns); + + rv = down_read_killable((struct rw_semaphore *)p_test_var_1); + if (rv < 0) + goto put_userns; + + rv = test_func1(net, user_ns, 0, 0, 0, 0, 0, 0); + + up_read((struct rw_semaphore *)p_test_var_1); + + if (rv < 0) { +put_userns: + test_func6(net->key_domain, 0, 0, 0, 0, 0, 0, 0); + put_user_ns(user_ns); + test_func5(net, 0, 0, 0, 0, 0, 0, 0); +dec_ucounts: + dec_net_namespaces(ucounts); + return ERR_PTR(rv); + } + return net; +} +/* net_drop_ns */ +unsigned long do_net_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, unsigned long arg6, + unsigned long arg7, unsigned long arg8, bool *done, int name) +{ + *done = false; + if (name == 5 && strstr(get_one_func_name(name), 
functions[6])) { + *done = true; + adjust_net_drop_ns(arg1); + } + return 0; +} +#endif +#define DEFINE_NET_BEFORE_AFTER(name) \ +unsigned long net_scene_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, \ + unsigned long arg4, unsigned long arg5, unsigned long arg6) \ +{ \ +\ + unsigned long ret = 0; \ + return ret;\ +};\ +void net_scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long ret, unsigned long test) \ +{ \ +\ +}; +#if defined TK3 || defined TK2 +DEFINE_NET_BEFORE_AFTER(1) +DEFINE_NET_BEFORE_AFTER(7) +DEFINE_NET_BEFORE_AFTER(8) +DEFINE_NET_BEFORE_AFTER(9) +DEFINE_NET_BEFORE_AFTER(10) +DEFINE_NET_BEFORE_AFTER(11) +#endif +//DEFINE_NET_BEFORE_AFTER(1) +DEFINE_NET_BEFORE_AFTER(2) +DEFINE_NET_BEFORE_AFTER(3) +DEFINE_NET_BEFORE_AFTER(4) +DEFINE_NET_BEFORE_AFTER(5) +DEFINE_NET_BEFORE_AFTER(6) +//DEFINE_NET_BEFORE_AFTER(7) +//DEFINE_NET_BEFORE_AFTER(8) +//DEFINE_NET_BEFORE_AFTER(9) +//DEFINE_NET_BEFORE_AFTER(10) +//DEFINE_NET_BEFORE_AFTER(11) +DEFINE_NET_BEFORE_AFTER(12) +DEFINE_NET_BEFORE_AFTER(13) +DEFINE_NET_BEFORE_AFTER(14) +DEFINE_NET_BEFORE_AFTER(15) +DEFINE_NET_BEFORE_AFTER(16) +DEFINE_NET_BEFORE_AFTER(17) +DEFINE_NET_BEFORE_AFTER(18) +DEFINE_NET_BEFORE_AFTER(19) +DEFINE_NET_BEFORE_AFTER(20) +DEFINE_NET_BEFORE_AFTER(21) +DEFINE_NET_BEFORE_AFTER(22) +DEFINE_NET_BEFORE_AFTER(23) +DEFINE_NET_BEFORE_AFTER(24) +DEFINE_NET_BEFORE_AFTER(25) +DEFINE_NET_BEFORE_AFTER(26) +DEFINE_NET_BEFORE_AFTER(27) +DEFINE_NET_BEFORE_AFTER(28) +DEFINE_NET_BEFORE_AFTER(29) +DEFINE_NET_BEFORE_AFTER(30) +DEFINE_NET_BEFORE_AFTER(31) +DEFINE_NET_BEFORE_AFTER(32) +DEFINE_NET_BEFORE_AFTER(33) +DEFINE_NET_BEFORE_AFTER(34) +DEFINE_NET_BEFORE_AFTER(35) +DEFINE_NET_BEFORE_AFTER(36) +DEFINE_NET_BEFORE_AFTER(37) +DEFINE_NET_BEFORE_AFTER(38) +DEFINE_NET_BEFORE_AFTER(39) +DEFINE_NET_BEFORE_AFTER(40) +DEFINE_NET_BEFORE_AFTER(41) +DEFINE_NET_BEFORE_AFTER(42) +DEFINE_NET_BEFORE_AFTER(43) +DEFINE_NET_BEFORE_AFTER(44) +DEFINE_NET_BEFORE_AFTER(45) +DEFINE_NET_BEFORE_AFTER(46) +DEFINE_NET_BEFORE_AFTER(47) +DEFINE_NET_BEFORE_AFTER(48) +DEFINE_NET_BEFORE_AFTER(49) +DEFINE_NET_BEFORE_AFTER(50) diff --git a/ops/os_stat/os_stat/net_scene/net_scene.h b/ops/os_stat/os_stat/net_scene/net_scene.h new file mode 100644 index 0000000000000000000000000000000000000000..4274a2228409c48a71053bcb850c4a7f1c7fd523 --- /dev/null +++ b/ops/os_stat/os_stat/net_scene/net_scene.h @@ -0,0 +1,104 @@ +#ifndef _NET_SCENE_H +#define _NET_SCENE_H +/* + * debug kernel problem + * aurelianliu@tencent.com + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../version.h" +#ifdef TK2 +#include +#endif +#include "../scene_layer.h" +#include "../syms.h" +#include "../data_aware.h" +#include "net_trackers.h" +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 portid; + u32 dst_portid; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; + unsigned long state; + size_t max_recvmsg_len; + wait_queue_head_t wait; + bool bound; + bool cb_running; + int dump_done_errno; + struct netlink_callback cb; + struct mutex *cb_mutex; + struct mutex cb_def_mutex; + void (*netlink_rcv)(struct sk_buff *skb); + int (*netlink_bind)(struct net *net, int group); + void (*netlink_unbind)(struct net *net, int group); + struct module *module; + + struct rhash_head node; + struct rcu_head rcu; + struct work_struct work; +}; + +struct 
net_tracer { + struct net net; + struct net *net_ptr; + struct ref_tracker_dir refcnt_tracker; + struct ref_tracker_dir notrefcnt_tracker; /* tracker for objects not + * refcounted against netns + */ +}; +#ifndef TK5 +static inline void __netns_tracker_alloc(struct net *net, + netns_tracker *tracker, + bool refcounted, + gfp_t gfp) +{ +#ifdef CONFIG_NET_NS_REFCNT_TRACKER + ref_tracker_alloc(refcounted ? &((struct net_tracer *)net)->refcnt_tracker : + &((struct net_tracer *)net)->notrefcnt_tracker, + tracker, gfp); +#endif +} +static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker, + gfp_t gfp) +{ + __netns_tracker_alloc(net, tracker, true, gfp); +} + +static inline void __netns_tracker_free(struct net *net, + netns_tracker *tracker, + bool refcounted) +{ +#ifdef CONFIG_NET_NS_REFCNT_TRACKER + ref_tracker_free(refcounted ? &((struct net_tracer *)net)->refcnt_tracker : + &((struct net_tracer *)net)->notrefcnt_tracker, tracker); +#endif +} + +static inline struct net *get_net_track(struct net *net, + netns_tracker *tracker, gfp_t gfp) +{ + //get_net(net); + netns_tracker_alloc(net, tracker, gfp); + return net; +} + +static inline void put_net_track(struct net *net, netns_tracker *tracker) +{ + __netns_tracker_free(net, tracker, true); + //put_net(net); +} +#endif + +#endif diff --git a/ops/os_stat/os_stat/net_scene/net_trackers.h b/ops/os_stat/os_stat/net_scene/net_trackers.h new file mode 100644 index 0000000000000000000000000000000000000000..bbcb3efedc33a31900f985023492c1688c0bd484 --- /dev/null +++ b/ops/os_stat/os_stat/net_scene/net_trackers.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_NET_TRACKERS_H +#define __NET_NET_TRACKERS_H +#include "ref_tracker.h" + +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +typedef struct ref_tracker *netdevice_tracker; +#else +typedef struct {} netdevice_tracker; +#endif + +#ifdef CONFIG_NET_NS_REFCNT_TRACKER +typedef struct ref_tracker *netns_tracker; +#else +typedef struct {} netns_tracker; +#endif + +#endif /* __NET_NET_TRACKERS_H */ diff --git a/ops/os_stat/os_stat/net_scene/ref_tracker.h b/ops/os_stat/os_stat/net_scene/ref_tracker.h new file mode 100644 index 0000000000000000000000000000000000000000..279cae450c26c8a4ab467d03173a04b8d702aaea --- /dev/null +++ b/ops/os_stat/os_stat/net_scene/ref_tracker.h @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#ifndef _LINUX_REF_TRACKER_H +#define _LINUX_REF_TRACKER_H +#ifndef TK2 +#include +#endif +#include +#include + +#define CONFIG_NET_NS_REFCNT_TRACKER +struct ref_tracker; + +struct ref_tracker_dir { +#ifdef CONFIG_REF_TRACKER + spinlock_t lock; + unsigned int quarantine_avail; + refcount_t untracked; + struct list_head list; /* List of active trackers */ + struct list_head quarantine; /* List of dead trackers */ +#endif +}; + +#ifdef CONFIG_REF_TRACKER +static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, + unsigned int quarantine_count) +{ + INIT_LIST_HEAD(&dir->list); + INIT_LIST_HEAD(&dir->quarantine); + spin_lock_init(&dir->lock); + dir->quarantine_avail = quarantine_count; + refcount_set(&dir->untracked, 1); +} + +void ref_tracker_dir_exit(struct ref_tracker_dir *dir); + +void ref_tracker_dir_print(struct ref_tracker_dir *dir, + unsigned int display_limit); + +int ref_tracker_alloc(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp, gfp_t gfp); + +int ref_tracker_free(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp); + +#else /* CONFIG_REF_TRACKER */ + +static inline void ref_tracker_dir_init(struct 
ref_tracker_dir *dir, + unsigned int quarantine_count) +{ +} + +static inline void ref_tracker_dir_exit(struct ref_tracker_dir *dir) +{ +} + +static inline void ref_tracker_dir_print(struct ref_tracker_dir *dir, + unsigned int display_limit) +{ +} + +static inline int ref_tracker_alloc(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp, + gfp_t gfp) +{ + return 0; +} + +static inline int ref_tracker_free(struct ref_tracker_dir *dir, + struct ref_tracker **trackerp) +{ + return 0; +} + +#endif + +#endif /* _LINUX_REF_TRACKER_H */ diff --git a/ops/os_stat/os_stat/net_scene/slub_scan.c b/ops/os_stat/os_stat/net_scene/slub_scan.c new file mode 100644 index 0000000000000000000000000000000000000000..faa8cea98c83be148624671f0b320681d3470d31 --- /dev/null +++ b/ops/os_stat/os_stat/net_scene/slub_scan.c @@ -0,0 +1,400 @@ +#include +#include +#include +#include +#include +#include +#include +#include "../version.h" +#ifdef TK4_NEW_NEW +#include +#else +#include "slab.h" +#endif +#include "../syms.h" +#include "../scene_layer.h" + +#if defined TK4_NEW_NEW || defined TK5 +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} +#endif + +struct list_head __slab_caches; +struct kmemcache_stat { + struct list_head list; + struct kmem_cache *slab; + unsigned long cnt; + void (*print_obj)(struct kmem_cache *, void *); + void *priv; +}; + +void *fixup_red_left(struct kmem_cache *s, void *p) +{ + p += s->red_left_pad; + + return p; +} + +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) + +void print_tcp(struct kmem_cache *s, void *p) +{ + struct inet_sock *inet = (struct inet_sock *)p; + struct sock *sk = (struct sock *)p; + struct net *net = sock_net(sk); + trace_printk("##### %pI4 -> %pI4 %u %u %u %u %u %u %u %u %u %u %p\n", + &inet->inet_daddr, &inet->inet_rcv_saddr, + ntohs(inet->inet_dport),ntohs(inet->inet_sport), + refcount_read(&sk->sk_refcnt), sk->sk_shutdown, sk->sk_net_refcnt, + tcp_sk(sk)->snd_cwnd, sk->sk_state, sk->sk_family, + refcount_read(&sk->sk_wmem_alloc), sk->sk_kern_sock, net); +} + +void print_tcp6(struct kmem_cache *s, void *p) +{ + struct inet_sock *inet = (struct inet_sock *)p; + struct sock *sk = (struct sock *)p; + struct net *net = sock_net(sk); + trace_printk("##### %pI6c -> %pI6c %u %u %u %u %u %u %u %u %u %p\n", + &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr, + ntohs(inet->inet_sport), ntohs(inet->inet_dport), + refcount_read(&sk->sk_refcnt), sk->sk_shutdown, sk->sk_net_refcnt, + tcp_sk(sk)->snd_cwnd, sk->sk_state, sk->sk_family, + refcount_read(&sk->sk_wmem_alloc), net); +} + +void print_udp(struct kmem_cache *s, void *p) +{ + struct sock *sk = (struct sock *)p; + struct inet_sock *inet = (struct inet_sock *)p; + struct net *net = sock_net(sk); + + trace_printk("##### %pI4 -> %pI4 %u %u %u %u %u %u %u %u %p\n", + &inet->inet_daddr, &inet->inet_rcv_saddr, + ntohs(inet->inet_dport),ntohs(inet->inet_sport), + refcount_read(&sk->sk_refcnt), sk->sk_shutdown, sk->sk_net_refcnt, + sk->sk_state, sk->sk_family, refcount_read(&sk->sk_wmem_alloc), net); +} + +void print_udp6(struct kmem_cache *s, void *p) +{ + struct sock *sk = (struct sock *)p; + struct inet_sock *inet = (struct inet_sock *)p; + struct net *net = sock_net(sk); + + trace_printk("##### %pI6c -> %pI6c %u %u %u %u %u %u %u %u %p\n", + &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr, + ntohs(inet->inet_sport), ntohs(inet->inet_dport), + refcount_read(&sk->sk_refcnt), sk->sk_shutdown, 
sk->sk_net_refcnt, + sk->sk_state, sk->sk_family, refcount_read(&sk->sk_wmem_alloc), net); +} + +void print_ping(struct kmem_cache *s, void *p) +{ + struct sock *sk = (struct sock *)p; + struct net *net = sock_net(sk); + + trace_printk("##### ping %u %u %u %p\n", + refcount_read(&sk->sk_refcnt), sk->sk_net_refcnt, sk->sk_kern_sock, net); +} + +void print_unix(struct kmem_cache *s, void *p) +{ + struct sock *sk = (struct sock *)p; + struct net *net = sock_net(sk); + + if (refcount_read(&sk->sk_refcnt) == 1 && sk->sk_net_refcnt == 1 && sk->sk_state == 7) { + struct unix_sock *u = unix_sk(sk); + int has_addr = u->addr == NULL ? 0 : 1; + trace_printk("1 ##### unix ref %u sk_net_refcnt %u sk_kern_sock %u sk_state %u " + "sk_type %u death %u shutdown %u sk_err %u wmem_alloc %u has_addr %u %pK %p, " + "unix sock:%lx, hash:%d" +#if defined TK4_NEW || defined TK5 + "pid:%d" +#endif + "\n", + refcount_read(&sk->sk_refcnt), sk->sk_net_refcnt, sk->sk_kern_sock, sk->sk_state, + sk->sk_type, sock_flag(sk, SOCK_DEAD), sk->sk_shutdown, sk->sk_err, + refcount_read(&sk->sk_wmem_alloc), has_addr, p, net, u, sk_unhashed(sk) +#if defined TK4_NEW || defined TK5 + , sk->pid +#endif + ); + if (u) { + trace_printk("5 ##### %p, u->addr:%p\n", u->path.dentry, u->addr); + if (u->addr) + trace_printk("5 ##### u->addr:%s\n", u->addr->name[0].sun_path); + if (u->path.dentry) + trace_printk("6 ##### %p, %s", u->path.dentry, u->path.dentry->d_iname); + } + if (sk->sk_socket) { + trace_printk("7 ##### %p, type:%d, state:%d, file:%p\n", sk->sk_socket, + sk->sk_socket->type, sk->sk_socket->state, + sk->sk_socket->file); + if (sk->sk_socket->file) + trace_printk("8 ##### %s\n", sk->sk_socket->file->f_path.dentry->d_iname); + } + } + + if (refcount_read(&sk->sk_refcnt) == 2 && sk->sk_net_refcnt == 0 && sk->sk_state == 1) { + struct unix_sock *u = unix_sk(sk); + int has_addr = u->addr == NULL ? 
0 : 1; + trace_printk("2 ##### unix ref %u sk_net_refcnt %u sk_kern_sock %u sk_state %u " + "sk_type %u death %u shutdown %u sk_err %u wmem_alloc %u has_addr %u %pK %p, " + "unix sock:%lx, %d" +#if defined TK4_NEW || defined TK5 + "pid:%d" +#endif + "\n", + refcount_read(&sk->sk_refcnt), sk->sk_net_refcnt, sk->sk_kern_sock, sk->sk_state, + sk->sk_type, sock_flag(sk, SOCK_DEAD), sk->sk_shutdown, sk->sk_err, + refcount_read(&sk->sk_wmem_alloc), has_addr, p, net, u, sk_unhashed(sk) +#if defined TK4_NEW || defined TK5 + , sk->pid +#endif + ); + if (u) { + trace_printk("3 ##### %p\n", u->path.dentry); + if (u->path.dentry) + trace_printk("4 ##### %p, %s", u->path.dentry, u->path.dentry->d_iname); + } + if (sk->sk_socket) { + trace_printk("7 ##### %p, type:%d, state:%d, file:%p\n", + sk->sk_socket, sk->sk_socket->type, sk->sk_socket->state, + sk->sk_socket->file); + if (sk->sk_socket->file) + trace_printk("8 ##### %s\n", sk->sk_socket->file->f_path.dentry->d_iname); + } + } +} + +void print_ping6(struct kmem_cache *s, void *p) +{ + struct sock *sk = (struct sock *)p; + struct net *net = sock_net(sk); + + trace_printk("##### ping6 %u %u %p\n", + refcount_read(&sk->sk_refcnt), sk->sk_net_refcnt, net); +} + +void print_raw(struct kmem_cache *s, void *p) +{ + struct sock *sk = (struct sock *)p; + struct inet_sock *inet = (struct inet_sock *)p; + struct net *net = sock_net(sk); + + trace_printk("##### raw %pI4 -> %pI4 %u %u %u %u %u %u %u %u %u %pK %p\n", + &inet->inet_daddr, &inet->inet_rcv_saddr, + ntohs(inet->inet_dport), inet->inet_num, + sk->sk_state, sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), + refcount_read(&sk->sk_refcnt), sk->sk_net_refcnt, + atomic_read(&sk->sk_drops), sk->sk_kern_sock, p, net); +} + +void print_raw6(struct kmem_cache *s, void *p) +{ + struct sock *sk = (struct sock *)p; + struct inet_sock *inet = (struct inet_sock *)p; + struct net *net = sock_net(sk); + + trace_printk("##### raw6 %pI6 -> %pI6 %u %u %u %u %p\n", + &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr, + ntohs(inet->inet_dport),ntohs(inet->inet_sport), + refcount_read(&sk->sk_refcnt), sk->sk_net_refcnt, net); +} + +void print_object(struct kmemcache_stat *ment, void *p) +{ + ment->cnt ++; + if (ment->print_obj) { + ment->print_obj(ment->slab, p); + } +} +#ifdef TK5 +static inline struct slab *slab_cache(struct page *page) +{ + struct folio *folio; + struct slab *slab; + + folio = page_folio(page); + if (!folio) + return NULL; + if (!folio_test_slab(folio)) + return NULL; + + slab = folio_slab(folio); + + return slab; +} +#endif +unsigned long show_objects(struct page *page) +{ + struct kmem_cache *s; + struct kmemcache_stat *entry; + struct slab *slab; + void *p, *addr; + int found = 0; + +#ifdef TK5 + slab = slab_cache(page); + if(!slab) + return 1; + s = slab->slab_cache; +#else + s = page->slab_cache; +#endif + + list_for_each_entry (entry, &__slab_caches, list) { + if (entry->slab == s) { + found = 1; + break; + } + } + + if (!found) { + return 1; + } +#ifdef TK5 + trace_printk("[%s] objs %u, inuse %u \n", + entry->slab->name, slab->objects, slab->inuse); +#else + trace_printk("[%s] objs %u, inuse %u \n", + entry->slab->name, page->objects, page->inuse); +#endif + addr = page_address(page); +#ifdef TK5 + for_each_object (p, s, addr, slab->objects) { +#else + for_each_object (p, s, addr, page->objects) { +#endif + print_object(entry, p); + } + // scan n pages already + return (PAGE_ALIGN((unsigned long)p) - (unsigned long)addr)/PAGE_SIZE; + return 0; +} + +static void slab_scan(void) +{ + int i = 0; + + 
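+	/*
+	 * Walk every valid pfn on each online node; pages that do not back a
+	 * slab are skipped one at a time, while show_objects() returns the
+	 * number of pages the scanned slab covers so its pages are not
+	 * visited twice. This assumes the memmap stays stable for the
+	 * duration of the scan.
+	 */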
for_each_online_node(i) { + unsigned long spfn, epfn, pfn; + + spfn= node_start_pfn(i); + epfn = node_end_pfn(i); + for (pfn = spfn; pfn < epfn;) { + struct page *page; + + if (!pfn_valid(pfn)) { + pfn++; + continue; + } + + page = pfn_to_page(pfn); + + if(!PageSlab(page)) { + pfn ++; + continue; + } + pfn += show_objects(page); + } + } +} + +int dump_slab_obj_start(void) +{ + struct kmem_cache *entry; + + if (!_slab_caches) { + return -1; + } + + INIT_LIST_HEAD(&__slab_caches); + list_for_each_entry (entry, _slab_caches, list) { + struct kmemcache_stat *stat; + if (strcmp(cache_name(entry), "UNIX")) + continue; + + stat = kmalloc(sizeof(struct kmemcache_stat), GFP_KERNEL); + stat->cnt = 0; + stat->print_obj = NULL; + INIT_LIST_HEAD(&stat->list); + list_add(&stat->list, &__slab_caches); + stat->slab = entry; + + if (!strcmp(cache_name(entry), "TCP")) + stat->print_obj = print_tcp; + + if (!strcmp(cache_name(entry), "TCPv6")) + stat->print_obj = print_tcp6; + + if (!strcmp(cache_name(entry), "RAW")) + stat->print_obj = print_raw; + + if (!strcmp(cache_name(entry), "RAWv6")) + stat->print_obj = print_raw6; + + if (!strcmp(cache_name(entry), "PING")) + stat->print_obj = print_ping; + + if (!strcmp(cache_name(entry), "PINGv6")) + stat->print_obj = print_ping6; + + if (!strcmp(cache_name(entry), "UDP")) + stat->print_obj = print_udp; + + if (!strcmp(cache_name(entry), "UDPv6")) + stat->print_obj = print_udp6; + + if (!strcmp(cache_name(entry), "UNIX")) + stat->print_obj = print_unix; + } + slab_scan(); + + return 0; +} + +void dump_slab_obj_end(void) +{ + struct kmemcache_stat *ment, *tmp; + unsigned long total = 0; + + list_for_each_entry_safe (ment, tmp, &__slab_caches, list) { + total += ment->cnt; + list_del(&ment->list); + kfree(ment); + } + trace_printk("total objs: %lu\n", total); +} +int sysctl_slub_enable_handler(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos) +#else + void __user *buffer, size_t *lenp, loff_t *ppos) +#endif +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!write) + return ret; + + if (sysctl_module_enable_slub == 1) { + init_scene(); + dump_slab_obj_start(); + dump_slab_obj_end(); + } + + + if (sysctl_module_enable_slub == 0) + exit_scene(); + + return ret; +} diff --git a/ops/os_stat/os_stat/parse_paramter.c b/ops/os_stat/os_stat/parse_paramter.c new file mode 100644 index 0000000000000000000000000000000000000000..67c4ff9e2385ddaaa348c877531ee0146e2e85ee --- /dev/null +++ b/ops/os_stat/os_stat/parse_paramter.c @@ -0,0 +1,87 @@ +/* + * Kernel dynamic hooks based on ftrace + * aurelianliu@tencent.com + */ +#include "data_aware.h" + +unsigned int sysctl_module_offset_enable = 0; +unsigned int sysctl_module_offset1 = 0; // var offset for 1st member +unsigned int sysctl_module_offset2 = 0; // var offset for 2dc member +unsigned int sysctl_module_offset3 = 0; // var offset for 3rd member +unsigned int sysctl_module_which_parameter = 0; // var offset for 3rd member +bool get_parameter(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + unsigned long addr; + unsigned long *pointer = 0; + unsigned long *member = 0; + bool ret = true; + + if (sysctl_module_offset_enable == 0) + return ret; + if (strstr(current->comm, "cat")) + return ret; + + if (strcmp(sysctl_module_process_comm, "0") && !strstr(sysctl_module_process_comm, current->comm)) + return true; + + addr = arg1; + + switch (sysctl_module_which_parameter) { + case 
1: + addr = arg1; + break; + case 2: + addr = arg2; + break; + case 3: + addr = arg3; + break; + case 4: + addr = arg4; + break; + case 5: + addr = arg5; + break; + case 6: + addr = arg6; + break; + default: + break; + } + ret = false; + if (addr == 0) + goto out; + addr += sysctl_module_offset1; + pointer = (unsigned long *)addr; + member = pointer; + + if (sysctl_module_offset2 != 0) { + if (pointer == NULL || *pointer == 0) + goto out; + pointer = (unsigned long *)(*pointer + sysctl_module_offset2); + } + + if (sysctl_module_offset3 != 0) { + if (pointer == NULL || *pointer == 0) + goto out; + pointer = (unsigned long *)(*pointer + sysctl_module_offset3); + } + ret = true; + + if (strstr(show_parameter_type, "int")) + if ((addr & 0xffff000000000000) == 0) + sprintf(show_parameter_val, "val:%lx, proc:%s, pid:%d", (unsigned long)pointer, current->comm, current->pid); + else + sprintf(show_parameter_val, "val:%x, proc:%s, pid:%d", (unsigned int)*pointer, current->comm, current->pid); + else if (strstr(show_parameter_type, "long")) + if ((addr & 0xffff000000000000) == 0) + sprintf(show_parameter_val, "val:%lx, proc:%s, pid:%d", (unsigned long)pointer, current->comm, current->pid); + else + sprintf(show_parameter_val, "val:%lx, proc:%s, pid:%d", *pointer, current->comm, current->pid); + else if (strstr(show_parameter_type, "char")) { + sprintf(show_parameter_val, "val:%s, proc:%s, pid:%d", (char *)pointer, current->comm, current->pid); + } +out: + return ret; +} diff --git a/ops/os_stat/os_stat/patch/debug_0402_001.patch b/ops/os_stat/os_stat/patch/debug_0402_001.patch new file mode 100644 index 0000000000000000000000000000000000000000..cc8e43632fca28570953e24fc979810a349cc5f5 --- /dev/null +++ b/ops/os_stat/os_stat/patch/debug_0402_001.patch @@ -0,0 +1,36 @@ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 764a0000df94..ee4dfff5127e 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1550,13 +1550,14 @@ static inline void update_idle_core(struct rq *rq) + static inline void update_idle_core(struct rq *rq) { } + #endif + +-DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++//DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +-#define this_rq() this_cpu_ptr(&runqueues) ++#define cpu_rq(cpu) (per_cpu_ptr(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(runqueues) + #define task_rq(p) cpu_rq(task_cpu(p)) + #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +-#define raw_rq() raw_cpu_ptr(&runqueues) ++//#define raw_rq() raw_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(runqueues) + + #ifdef CONFIG_FAIR_GROUP_SCHED + static inline struct task_struct *task_of(struct sched_entity *se) +@@ -1934,10 +1935,10 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + return sd; + } + +-DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); ++//DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); +-DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); ++//DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); diff --git a/ops/os_stat/os_stat/patch/debug_xfs_demo.patch b/ops/os_stat/os_stat/patch/debug_xfs_demo.patch new file mode 100644 index 0000000000000000000000000000000000000000..8784065de5280db90eb46f03f0b50a97dba2c4ad --- /dev/null +++ 
b/ops/os_stat/os_stat/patch/debug_xfs_demo.patch @@ -0,0 +1,309 @@ +diff --git a/ops/os_stat/os_stat/Makefile b/ops/os_stat/os_stat/Makefile +index 90337f4..05fd717 100644 +--- a/ops/os_stat/os_stat/Makefile ++++ b/ops/os_stat/os_stat/Makefile +@@ -2,6 +2,12 @@ + # Makefile for sys module + # + ++ifeq (${version}, 6.6) ++KBUILD_CFLAGS+=-I/data/tkernel5/fs/xfs/libxfs/ ++else ++KBUILD_CFLAGS+=-I/data/tkernel4/fs/xfs/libxfs/ ++endif ++ + obj-m += os_aware.o + + os_aware-y := main.o data_aware.o syms.o hook_tk5.o hook.o kretprobe_prehook.o kprobe_prehook.o sysctl.o ftrace_hook.o func_pointer_table.o func_struct_table.o +@@ -15,6 +21,6 @@ KERNELDIR=`pwd`/include + MODULEDIR=`pwd` + + default: +- /usr/bin/make -C $(KERNELDIR) M=$(MODULEDIR) modules ++ /usr/bin/make -C $(KBUILD_CFLAGS) $(KERNELDIR) M=$(MODULEDIR) modules + clean: + /usr/bin/make -C $(KERNELDIR) M=$(MODULEDIR) clean +diff --git a/ops/os_stat/os_stat/ftrace_hook.c b/ops/os_stat/os_stat/ftrace_hook.c +index 1a74534..6faf307 100644 +--- a/ops/os_stat/os_stat/ftrace_hook.c ++++ b/ops/os_stat/os_stat/ftrace_hook.c +@@ -12,12 +12,37 @@ + #include + #include + #include ++#include + + #include "version.h" + #include "hook.h" + #include "data_aware.h" + #include "kprobe_prehook.h" +- ++#include "include/fs/xfs/xfs_linux.h" ++#include "include/fs/xfs/libxfs/xfs_types.h" ++#include "include/fs/xfs/libxfs/xfs_cksum.h" ++#include "include/fs/xfs/libxfs/xfs_fs.h" ++#include "include/fs/xfs/xfs_buf.h" ++#include "include/fs/xfs/xfs_trans.h" ++#include "include/fs/xfs/libxfs/xfs_log_format.h" ++#include "include/fs/xfs/xfs_log_priv.h" ++#include "include/fs/xfs/xfs_log.h" ++#include "include/fs/xfs/xfs_trans_priv.h" ++#include "include/fs/xfs/xfs_extfree_item.h" ++#include "include/fs/xfs/mrlock.h" ++#include "include/fs/xfs/mrlock.h" ++#include "include/fs/xfs/libxfs/xfs_format.h" ++#include "include/fs/xfs/libxfs/xfs_trans_resv.h" ++#include "include/fs/xfs/libxfs/xfs_shared.h" ++#include "include/fs/xfs/xfs_mount.h" ++#include "include/fs/xfs/libxfs/xfs_inode_buf.h" ++#include "include/fs/xfs/xfs_inode.h" ++#include "include/fs/xfs/libxfs/xfs_btree.h" ++#include "include/fs/xfs/xfs_buf_item.h" ++#include "include/fs/xfs/libxfs/xfs_defer.h" ++#include "include/fs/xfs/kmem.h" ++ ++#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ + DEFINE_MUTEX(hook_func); + #define HOOK_FUNC_NUM 100 + struct ftrace_verify_func +@@ -48,8 +73,8 @@ char ftrace_hook_name[NAME_MAX]; + struct work_struct ftrace_work; + struct work_struct ftrace_work_init; + unsigned long (*p__test_func)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, +- unsigned long arg5, unsigned long arg6); +-#define __test_func(arg1, arg2, arg3, arg4, arg5, arg6) p__test_func(arg1, arg2, arg3, arg4, arg5, arg6) ++ unsigned long arg5, unsigned long arg6, struct xfs_bmbt_irec imap, struct xfs_bmbt_irec arg7, unsigned long arg8); ++#define __test_func(arg1, arg2, arg3, arg4, arg5, arg6, imap, arg7, arg8) p__test_func(arg1, arg2, arg3, arg4, arg5, arg6, imap, arg7, arg8) + + struct open_flags { + int open_flag; +@@ -64,27 +89,109 @@ char func_pointer_name[NAME_MAX]; + char printk_name_first[NAME_MAX]; + char printk_name_last[NAME_MAX]; + static int hook_count; ++static inline xfs_fileoff_t xfs_iomap_end_fsb( ++ struct xfs_mount *mp, ++ loff_t offset, ++ loff_t count) ++{ ++ return min(XFS_B_TO_FSB(mp, offset + count), ++ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); ++} ++void debug(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned 
long arg4, ++ unsigned long arg5, unsigned long arg6, struct xfs_bmbt_irec imap, struct xfs_bmbt_irec cmap, unsigned long arg8) ++{ ++ struct inode *inode = (struct inode *)arg1; ++ loff_t offset = arg2; ++ loff_t count = arg3; ++ unsigned flags = arg4; ++ struct iomap *iomap = (struct iomap *)arg5; ++ struct xfs_inode *ip = XFS_I(inode); ++ struct xfs_mount *mp = (struct xfs_mount *)arg6; ++ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); ++ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); ++ xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); ++ xfs_fsblock_t prealloc_blocks = 0; ++ ++ if ((inode->kabi_reserved1 != 1) || !(strstr(current->comm, "fsstress"))) ++ return; ++ ++ if (arg8 == 1) { ++ pr_info("%s, %d, %ld, %ld, flags:%d, startoff:%ld, %d, proc:%s, %d, pos:%d", __func__, __LINE__, offset_fsb, eof_fsb, ++ (flags & IOMAP_ZERO), imap.br_startoff, ++ isnullstartblock(imap.br_startblock), current->comm, current->pid, (int)arg8); ++ } ++ if (arg8 == 2) { ++ pr_info("%s, %d, offset:%ld, count:%ld, start:%lu, count:%lu, proc:%s %d, eof_fsb:%lu, imap:%lu, %d, pos:%d", ++ __func__, __LINE__, offset_fsb, count, cmap.br_startoff, ++ cmap.br_blockcount, current->comm, current->pid, ++ eof_fsb, imap.br_startoff, isnullstartblock(imap.br_startblock), (int)arg8); ++ } ++ if (arg8 == 3) { ++ pr_info("%s, %d, offset:%ld, count:%ld, start:%lu, count:%lu, proc:%s %d, eof_fsb:%lu, imap:%lu, %d, pos:%d", ++ __func__, __LINE__, offset_fsb, count, cmap.br_startoff, ++ cmap.br_blockcount, current->comm, current->pid, ++ eof_fsb, imap.br_startoff, isnullstartblock(imap.br_startblock), (int)arg8); ++ } ++ if (arg8 == 4) { ++ pr_info("%s, %d, offset:%lu, %lu, size:%lu, proc:%s, %d, pos:%d", __func__, __LINE__, ++ offset, count, XFS_ISIZE(ip), current->comm, current->pid, (int)arg8); ++ } ++ if (arg8 == 5) { ++ pr_info("%s, %d, offset:%lu, %lu, size:%lu, proc:%s, %d, pos:%d", __func__, __LINE__, ++ offset, count, XFS_ISIZE(ip), current->comm, current->pid, (int)arg8); ++ } ++ if (arg8 == 6) { ++ pr_info("%s, %d, offset:%lu, %lu, size:%lu, pre:%ld, proc:%s, %d, pos:%d", __func__, __LINE__, ++ offset, count, XFS_ISIZE(ip), prealloc_blocks, current->comm, current->pid, (int)arg8); ++ } ++ if (arg8 == 7) { ++ pr_info("%s, %d, imap:%lu, count:%ld, %ld, offset:%lu, end:%lu, delta:%lu, pre:%lu, cmap start:%ld, len:%ld, count:%ld, isnull:%d, flags:%d, proc:%s, %d, pos:%d", __func__, __LINE__, ++ imap.br_startoff, imap.br_blockcount, imap.br_startoff + imap.br_blockcount, offset_fsb, end_fsb, end_fsb - offset_fsb, prealloc_blocks, cmap.br_startoff, ++ cmap.br_startoff - offset_fsb, cmap.br_blockcount, isnullstartblock(cmap.br_startblock), flags & IOMAP_ZERO, current->comm, current->pid, (int)arg8); ++ } ++ if (arg8 == 8) { ++ pr_info("%s, %d, offset:%lu, end:%lu, delta:%lu, pre:%lu, proc:%s, %d, pos:%d", __func__, __LINE__, ++ offset_fsb, end_fsb, end_fsb - offset_fsb, prealloc_blocks, current->comm, current->pid, (int)arg8); ++ } ++ if (arg8 == 9) { ++ pr_info("%s, %d, imap:%lu, offset:%lu, eof:%lu, end:%lu, delta:%lu, pre:%lu, cmap start:%ld, count:%ld, isnull:%d, flags:%d, proc:%s, %d, pos:%d", __func__, __LINE__, ++ imap.br_startoff, offset_fsb, eof_fsb, end_fsb, end_fsb - offset_fsb, prealloc_blocks, cmap.br_startoff, ++ cmap.br_blockcount, isnullstartblock(cmap.br_startblock), flags & IOMAP_ZERO, current->comm, current->pid, (int)arg8); ++ ++ } ++ ++} + unsigned long test_func(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, +- unsigned long arg5, unsigned 
long arg6) ++ unsigned long arg5, unsigned long arg6, struct xfs_bmbt_irec imap, struct xfs_bmbt_irec arg7, unsigned long arg8) + { + ssize_t ret; + + percpu_counter_inc(&ftrace_patch_num); + ++ debug(arg1, arg2, arg3, arg4, arg5, arg6, imap, arg7, arg8); ++ + print_func_name((void *)arg1, (void *)arg2, (void *)arg3, func_pointer, func_pointer_name); + print_info((void *)arg1, (void *)arg2, (void *)arg3); + + stat_func_enter(NULL); +- ret = __test_func(arg1, arg2, arg3, arg4, arg5, arg6); ++ ret = __test_func(arg1, arg2, arg3, arg4, arg5, arg6, imap, arg7, arg8); + stat_func_exit(NULL); + + percpu_counter_dec(&ftrace_patch_num); + + return ret; + } ++//iomap_write_begin + unsigned long print_before_1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) + { ++ struct iomap_iter *iter = (struct iomap_iter *)arg1; ++ unsigned long pos = arg2; ++ unsigned long len = arg3; ++ ++ if (pos + len > iter->iomap.offset + iter->iomap.length) ++ pr_info("pos:%ld, len:%ld, offset:%ld, len:%ld, %ld, %ld, proc:%s, %d", pos, len, iter->iomap.offset, iter->iomap.length, ++ pos + len, iter->iomap.offset + iter->iomap.length, current->comm, current->pid); + return 0; + + } +@@ -92,28 +199,51 @@ void print_after_1(ssize_t ret, unsigned long arg1, unsigned long arg2, unsigned + unsigned long arg5, unsigned long arg6) + { + } ++//xfs_buffered_write_iomap_begin + unsigned long print_before_2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) + { ++ struct inode *inode = (struct inode *)arg1; ++ struct xfs_inode *ip = XFS_I(inode); ++ //pr_info("xfs_buffered_write_iomap_begin start:%p, %p, %s, %d\n", inode, ip, ++ // current->comm, current->pid); + return 0; + }; + void print_after_2(ssize_t ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) + { ++ struct inode *inode = (struct inode *)arg1; ++ struct xfs_inode *ip = XFS_I(inode); ++ //pr_info("xfs_buffered_write_iomap_begin end:%p, %p, %s, %d\n", inode, ip, ++ // current->comm, current->pid); + }; ++//xfs_bmap_extsize_align + unsigned long print_before_3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) + { ++ xfs_extlen_t *lenp = (xfs_extlen_t *)arg5; ++ pr_info("xfs_bmap_extsize_align before:%ld, %s, %d\n", *lenp, ++ current->comm, current->pid); + return 0; + }; + void print_after_3(ssize_t ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) + { ++ xfs_extlen_t *lenp = (xfs_extlen_t *)arg5; ++ pr_info("xfs_bmap_extsize_align after:%ld, %s, %d\n", *lenp, ++ current->comm, current->pid); + }; +- ++//xfs_bmapi_reserve_delalloc + unsigned long print_before_4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) + { ++ xfs_fileoff_t off = arg3; ++ xfs_filblks_t len = arg4; ++ struct xfs_bmbt_irec *got = (struct xfs_bmbt_irec *)arg6; ++ xfs_filblks_t alen = XFS_FILBLKS_MIN(len, got->br_startoff - off); ++ ++ pr_info("xfs_bmapi_reserve_delalloc after:%ld, %ld, %ld, proc:%s, %d\n", alen, len, got->br_startoff - off, ++ current->comm, current->pid); + return 0; + }; + void print_after_4(ssize_t ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, +@@ -132,20 +262,20 @@ void print_after_5(ssize_t ret, unsigned long 
arg1, unsigned long arg2, unsigned + + #define DEFINE_TEST(name) \ + unsigned long (*p__test_func##name)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ +- unsigned long arg5, unsigned long arg6); \ ++ unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, unsigned long arg9, unsigned long arg10); \ + unsigned long test_func##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ +- unsigned long arg5, unsigned long arg6) \ ++ unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, unsigned long arg9, unsigned long arg10) \ + {\ + ssize_t ret; \ + unsigned long test; \ + \ + percpu_counter_inc(&ftrace_patch_num); \ + \ +- test = print_before_##name(arg1, arg2, arg3, arg4, arg5, arg6); \ ++ test = print_before_##name(arg1, arg2, arg3, arg4, arg10, arg6); \ + \ +- ret = p__test_func##name(arg1, arg2, arg3, arg4, arg5, arg6); \ ++ ret = p__test_func##name(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10); \ + \ +- print_after_##name(ret, arg1, arg2, arg3, arg4, arg5, test); \ ++ print_after_##name(ret, arg1, arg2, arg3, arg4, arg10, test); \ + \ + percpu_counter_dec(&ftrace_patch_num); \ + \ +diff --git a/ops/os_stat/os_stat/func_pointer_table.c b/ops/os_stat/os_stat/func_pointer_table.c +index 209a6d1..32f34c6 100644 +--- a/ops/os_stat/os_stat/func_pointer_table.c ++++ b/ops/os_stat/os_stat/func_pointer_table.c +@@ -15,7 +15,7 @@ + #include "syms.h" + #include "./include/kernel/sched/sched.h" + #include "./include/drivers/target/target_core_file.h" +-#include "./include/drivers/block/loop.h" ++#include "./include_6_6/drivers/block/loop.h" + + #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) + extern unsigned int sysctl_module_debug; +diff --git a/ops/os_stat/os_stat/func_pointer_table_6_6.c b/ops/os_stat/os_stat/func_pointer_table_6_6.c +index 1f97ce3..c0ffcf5 100644 +--- a/ops/os_stat/os_stat/func_pointer_table_6_6.c ++++ b/ops/os_stat/os_stat/func_pointer_table_6_6.c +@@ -16,7 +16,7 @@ + #include "syms.h" + #include "./include/kernel/sched/sched.h" + #include "./include/drivers/target/target_core_file.h" +-#include "./include/drivers/block/loop.h" ++#include "./include_6_6/drivers/block/loop.h" + + #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) + extern unsigned int sysctl_module_debug; +diff --git a/ops/os_stat/os_stat/func_struct_table.c b/ops/os_stat/os_stat/func_struct_table.c +index b1c8c91..f10b941 100644 +--- a/ops/os_stat/os_stat/func_struct_table.c ++++ b/ops/os_stat/os_stat/func_struct_table.c +@@ -14,7 +14,7 @@ + #include "syms.h" + #include "./include/kernel/sched/sched.h" + #include "./include/drivers/target/target_core_file.h" +-#include "./include/drivers/block/loop.h" ++#include "./include_6_6/drivers/block/loop.h" + + #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) + extern unsigned int sysctl_module_debug; diff --git a/ops/os_stat/os_stat/patch/debug_xfs_demo_002.patch b/ops/os_stat/os_stat/patch/debug_xfs_demo_002.patch new file mode 100644 index 0000000000000000000000000000000000000000..b2b0ed8442f75886d5b8469a5f569cced1048cd1 --- /dev/null +++ b/ops/os_stat/os_stat/patch/debug_xfs_demo_002.patch @@ -0,0 +1,138 @@ +diff --git a/ops/os_stat/os_stat/Makefile b/ops/os_stat/os_stat/Makefile +index 90337f4..05fd717 100644 +--- a/ops/os_stat/os_stat/Makefile ++++ b/ops/os_stat/os_stat/Makefile +@@ -2,6 +2,12 @@ + # Makefile for sys module + # + ++ifeq 
(${version}, 6.6) ++KBUILD_CFLAGS+=-I/data/tkernel5/fs/xfs/libxfs/ ++else ++KBUILD_CFLAGS+=-I/data/tkernel4/fs/xfs/libxfs/ ++endif ++ + obj-m += os_aware.o + + os_aware-y := main.o data_aware.o syms.o hook_tk5.o hook.o kretprobe_prehook.o kprobe_prehook.o sysctl.o ftrace_hook.o func_pointer_table.o func_struct_table.o +@@ -15,6 +21,6 @@ KERNELDIR=`pwd`/include + MODULEDIR=`pwd` + + default: +- /usr/bin/make -C $(KERNELDIR) M=$(MODULEDIR) modules ++ /usr/bin/make -C $(KBUILD_CFLAGS) $(KERNELDIR) M=$(MODULEDIR) modules + clean: + /usr/bin/make -C $(KERNELDIR) M=$(MODULEDIR) clean +diff --git a/ops/os_stat/os_stat/ftrace_hook.c b/ops/os_stat/os_stat/ftrace_hook.c +index b898c5b..ea6a54c 100644 +--- a/ops/os_stat/os_stat/ftrace_hook.c ++++ b/ops/os_stat/os_stat/ftrace_hook.c +@@ -17,6 +17,31 @@ + #include "hook.h" + #include "data_aware.h" + #include "kprobe_prehook.h" ++#include "include/fs/xfs/xfs_linux.h" ++#include "include/fs/xfs/libxfs/xfs_types.h" ++#include "include/fs/xfs/libxfs/xfs_cksum.h" ++#include "include/fs/xfs/libxfs/xfs_fs.h" ++#include "include/fs/xfs/xfs_buf.h" ++#include "include/fs/xfs/xfs_trans.h" ++#include "include/fs/xfs/libxfs/xfs_log_format.h" ++#include "include/fs/xfs/xfs_log_priv.h" ++#include "include/fs/xfs/xfs_log.h" ++#include "include/fs/xfs/xfs_trans_priv.h" ++#include "include/fs/xfs/xfs_extfree_item.h" ++#include "include/fs/xfs/mrlock.h" ++#include "include/fs/xfs/mrlock.h" ++#include "include/fs/xfs/libxfs/xfs_format.h" ++#include "include/fs/xfs/libxfs/xfs_trans_resv.h" ++#include "include/fs/xfs/libxfs/xfs_shared.h" ++#include "include/fs/xfs/xfs_mount.h" ++#include "include/fs/xfs/libxfs/xfs_inode_buf.h" ++#include "include/fs/xfs/xfs_inode.h" ++#include "include/fs/xfs/libxfs/xfs_btree.h" ++#include "include/fs/xfs/xfs_buf_item.h" ++#include "include/fs/xfs/libxfs/xfs_defer.h" ++#include "include/fs/xfs/kmem.h" ++ ++#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ + + #ifdef CONFIG_FUNCTION_TRACER + #define CC_USING_FENTRY +@@ -86,6 +111,10 @@ unsigned int sysctl_module_offset1 = 0; // var offset for 1st member + unsigned int sysctl_module_offset2 = 0; // var offset for 2dc member + unsigned int sysctl_module_offset3 = 0; // var offset for 3rd member + unsigned int sysctl_module_which_parameter = 0; // var offset for 3rd member ++unsigned long (*p__test_func1)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, ++ unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); ++unsigned long test_func1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, ++ unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); + bool debug_test_func(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) + { +@@ -177,6 +206,26 @@ unsigned long test_func(unsigned long arg1, unsigned long arg2, unsigned long ar + stat_func_enter(NULL); + ret = __test_func(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); + stat_func_exit(NULL); ++ if (1) { ++ struct xfs_buf *agibp; ++ struct xfs_trans *tp = (struct xfs_trans *)arg1; ++ struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_agi *agi; ++ xfs_agino_t next_agino; ++ struct xfs_inode *ip = (struct xfs_inode *)arg2; ++ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); ++ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); ++ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; ++ ++ if (!p__test_func1) ++ goto out; ++ test_func1(mp, tp, 
agno, &agibp, 0, 0, 0, 0); ++ agi = XFS_BUF_TO_AGI(agibp); ++ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); ++ pr_info("----%d, %d", next_agino, agino); ++ ++ } ++out: + + if (!debug) + debug_test_func(arg1, arg2, arg3, arg4, arg5, arg6); +diff --git a/ops/os_stat/os_stat/func_pointer_table.c b/ops/os_stat/os_stat/func_pointer_table.c +index 8b3d471..9a14ec7 100644 +--- a/ops/os_stat/os_stat/func_pointer_table.c ++++ b/ops/os_stat/os_stat/func_pointer_table.c +@@ -16,7 +16,7 @@ + #include "syms.h" + #include "./include/kernel/sched/sched.h" + #include "./include/drivers/target/target_core_file.h" +-#include "./include/drivers/block/loop.h" ++#include "./include_6_6/drivers/block/loop.h" + + #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) + extern unsigned int sysctl_module_debug; +diff --git a/ops/os_stat/os_stat/func_pointer_table_6_6.c b/ops/os_stat/os_stat/func_pointer_table_6_6.c +index 1f97ce3..c0ffcf5 100644 +--- a/ops/os_stat/os_stat/func_pointer_table_6_6.c ++++ b/ops/os_stat/os_stat/func_pointer_table_6_6.c +@@ -16,7 +16,7 @@ + #include "syms.h" + #include "./include/kernel/sched/sched.h" + #include "./include/drivers/target/target_core_file.h" +-#include "./include/drivers/block/loop.h" ++#include "./include_6_6/drivers/block/loop.h" + + #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) + extern unsigned int sysctl_module_debug; +diff --git a/ops/os_stat/os_stat/func_struct_table.c b/ops/os_stat/os_stat/func_struct_table.c +index dd36991..db15925 100644 +--- a/ops/os_stat/os_stat/func_struct_table.c ++++ b/ops/os_stat/os_stat/func_struct_table.c +@@ -14,7 +14,7 @@ + #include "syms.h" + #include "./include/kernel/sched/sched.h" + #include "./include/drivers/target/target_core_file.h" +-#include "./include/drivers/block/loop.h" ++#include "./include_6_6/drivers/block/loop.h" + + #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) + extern unsigned int sysctl_module_debug; diff --git a/ops/os_stat/os_stat/patch/debug_xfs_template_0811.patch b/ops/os_stat/os_stat/patch/debug_xfs_template_0811.patch new file mode 100644 index 0000000000000000000000000000000000000000..bf174812909a022b48c21a6bebd4fbec18bff443 --- /dev/null +++ b/ops/os_stat/os_stat/patch/debug_xfs_template_0811.patch @@ -0,0 +1,48 @@ +diff --git a/ops/os_stat/os_stat/Makefile b/ops/os_stat/os_stat/Makefile +index afdf371..acedb7b 100644 +--- a/ops/os_stat/os_stat/Makefile ++++ b/ops/os_stat/os_stat/Makefile +@@ -24,6 +24,8 @@ KBUILD_CFLAGS+=-I/usr/lib/tencentos-tools/ops/os_stat/os_stat/include/fs/ext4_ol + KBUILD_CFLAGS+=-I/usr/lib/tencentos-tools/ops/os_stat/os_stat/include/mm/mm_le_0011 + endif + ++KBUILD_CFLAGS+=-I/usr/lib/tencentos-tools/ops/os_stat/os_stat/include/fs/xfs/libxfs/ ++ + default: + /usr/bin/make -C $(KERNELDIR) M=$(MODULEDIR) modules + clean: +diff --git a/ops/os_stat/os_stat/scene_template.c b/ops/os_stat/os_stat/scene_template.c +index f64c9e3..a382784 100644 +--- a/ops/os_stat/os_stat/scene_template.c ++++ b/ops/os_stat/os_stat/scene_template.c +@@ -5,7 +5,29 @@ + + #include "scene_layer.h" + #include "data_aware.h" +- ++#include "include/fs/xfs/xfs_linux.h" ++#include "include/fs/xfs/libxfs/xfs_types.h" ++#include "include/fs/xfs/libxfs/xfs_cksum.h" ++#include "include/fs/xfs/libxfs/xfs_fs.h" ++#include "include/fs/xfs/xfs_buf.h" ++#include "include/fs/xfs/xfs_trans.h" ++#include "include/fs/xfs/libxfs/xfs_log_format.h" ++#include "include/fs/xfs/xfs_log_priv.h" ++#include 
"include/fs/xfs/xfs_log.h" ++#include "include/fs/xfs/xfs_trans_priv.h" ++#include "include/fs/xfs/xfs_extfree_item.h" ++#include "include/fs/xfs/mrlock.h" ++#include "include/fs/xfs/mrlock.h" ++#include "include/fs/xfs/libxfs/xfs_format.h" ++#include "include/fs/xfs/libxfs/xfs_trans_resv.h" ++#include "include/fs/xfs/libxfs/xfs_shared.h" ++#include "include/fs/xfs/xfs_mount.h" ++#include "include/fs/xfs/libxfs/xfs_inode_buf.h" ++#include "include/fs/xfs/xfs_inode.h" ++#include "include/fs/xfs/libxfs/xfs_btree.h" ++#include "include/fs/xfs/xfs_buf_item.h" ++#include "include/fs/xfs/libxfs/xfs_defer.h" ++#include "include/fs/xfs/kmem.h" + static char functions[][32] = { + "function_name1", //change your function which to debug + "function_name2", //change your function which to debug diff --git a/ops/os_stat/os_stat/scene_layer.c b/ops/os_stat/os_stat/scene_layer.c new file mode 100644 index 0000000000000000000000000000000000000000..3d9dc762719be4b0988efc8707750bf0f0703f52 --- /dev/null +++ b/ops/os_stat/os_stat/scene_layer.c @@ -0,0 +1,209 @@ +/* + * find kernel problem + * aurelianliu@tencent.com + */ +#include "scene_layer.h" +#include "data_aware.h" + +unsigned long sysctl_enable_debug = 0; +unsigned int sysctl_module_enable_slub = 0;// enable slub debug by sysctl +unsigned int sysctl_module_enable_irq = 0;// enable slub debug by sysctl +int *stat_write = NULL; +int *stat_process = NULL; +char *stat_proc_comm[IO_ARRAY_ITEM]; +int *stat_add = NULL; +int *stat_add_process = NULL; +char *stat_add_proc_comm[IO_ARRAY_ITEM]; +int cur_total_index; +int cur_total_add_index; + +bool scene_enabled(void) +{ + if (sysctl_trace_type <= SIGRTMAX + 2 || sysctl_trace_type > 100) + return false; + + return true; +} + +bool debug_enabled(void) +{ + if (sysctl_trace_type == 100) + return true; + + return false; +} + +bool scene_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6) +{ + bool ret, done; + + ret = do_scene_template_before(arg1, arg2, arg3, arg4, arg5, arg6, &done, 0); + if (done) + goto out; + + if (!scene_enabled()) + return false; + + io_scene_before(arg1, arg2, arg3, arg4, arg5, arg6); + mm_scene_before(arg1, arg2, arg3, arg4, arg5, arg6); + +out: + return ret; +} + +void scene_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret) +{ + bool done = false; + + do_scene_template_after(arg1, arg2, arg3, arg4, arg5, ret, 0, &done, 0); + if (done) + goto out; + + if (!scene_enabled()) + goto out; + + io_scene_after(arg1, arg2, arg3, arg4, arg5, ret); + mm_scene_after(arg1, arg2, arg3, arg4, arg5, ret); +out: + return; +} +unsigned long do_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, bool *done, int name) +{ + unsigned long ret; + + ret = do_template_scene(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, done, name); + if (done) + goto out; + + if (!scene_enabled()) { + *done = false; + goto out; + } + + ret = do_io_scene(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, done, name); + if (! 
*done) + ret = do_net_scene(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, done, name); + +out: + return ret; +} + +#define DEFINE_BEFORE_AFTER(name) \ +unsigned long scene_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6) \ +{ \ +\ + unsigned long ret; \ + bool done = false; \ +\ + ret = do_scene_template_before(arg1, arg2, arg3, arg4, arg5, arg6, &done, name); \ + if (done) \ + goto out; \ +\ + if (!scene_enabled()) \ + return 0; \ +\ + ret = io_scene_before_##name(arg1, arg2, arg3, arg4, arg5, arg6); \ + if (ret == 0) \ + ret = net_scene_before_##name(arg1, arg2, arg3, arg4, arg5, arg6); \ +out: \ + return ret; \ +};\ +\ +void scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long ret, unsigned long test) \ +{ \ + bool done = false; \ +\ + do_scene_template_after(arg1, arg2, arg3, arg4, arg5, ret, test, &done, name); \ + if (done) \ + return; \ +\ + if (!scene_enabled()) \ + return; \ +\ + io_scene_after_##name(arg1, arg2, arg3, arg4, arg5, ret, test); \ + net_scene_after_##name(arg1, arg2, arg3, arg4, arg5, ret, test); \ +}; +DEFINE_BEFORE_AFTER(1) +DEFINE_BEFORE_AFTER(2) +DEFINE_BEFORE_AFTER(3) +DEFINE_BEFORE_AFTER(4) +DEFINE_BEFORE_AFTER(5) +DEFINE_BEFORE_AFTER(6) +DEFINE_BEFORE_AFTER(7) +DEFINE_BEFORE_AFTER(8) +DEFINE_BEFORE_AFTER(9) +DEFINE_BEFORE_AFTER(10) +DEFINE_BEFORE_AFTER(11) +DEFINE_BEFORE_AFTER(12) +DEFINE_BEFORE_AFTER(13) +DEFINE_BEFORE_AFTER(14) +DEFINE_BEFORE_AFTER(15) +DEFINE_BEFORE_AFTER(16) +DEFINE_BEFORE_AFTER(17) +DEFINE_BEFORE_AFTER(18) +DEFINE_BEFORE_AFTER(19) +DEFINE_BEFORE_AFTER(20) +DEFINE_BEFORE_AFTER(21) +DEFINE_BEFORE_AFTER(22) +DEFINE_BEFORE_AFTER(23) +DEFINE_BEFORE_AFTER(24) +DEFINE_BEFORE_AFTER(25) +DEFINE_BEFORE_AFTER(26) +DEFINE_BEFORE_AFTER(27) +DEFINE_BEFORE_AFTER(28) +DEFINE_BEFORE_AFTER(29) +DEFINE_BEFORE_AFTER(30) +DEFINE_BEFORE_AFTER(31) +DEFINE_BEFORE_AFTER(32) +DEFINE_BEFORE_AFTER(33) +DEFINE_BEFORE_AFTER(34) +DEFINE_BEFORE_AFTER(35) +DEFINE_BEFORE_AFTER(36) +DEFINE_BEFORE_AFTER(37) +DEFINE_BEFORE_AFTER(38) +DEFINE_BEFORE_AFTER(39) +DEFINE_BEFORE_AFTER(40) +DEFINE_BEFORE_AFTER(41) +DEFINE_BEFORE_AFTER(42) +DEFINE_BEFORE_AFTER(43) +DEFINE_BEFORE_AFTER(44) +DEFINE_BEFORE_AFTER(45) +DEFINE_BEFORE_AFTER(46) +DEFINE_BEFORE_AFTER(47) +DEFINE_BEFORE_AFTER(48) +DEFINE_BEFORE_AFTER(49) +DEFINE_BEFORE_AFTER(50) + +void init_scene(void) +{ + int i; + int size = IO_ARRAY_ITEM * 4 * sizeof(int)+ 2 * IO_ARRAY_SIZE * IO_ARRAY_ITEM * sizeof(char); + + stat_write = (unsigned int *)vmalloc(size); + if (!stat_write) + goto out; + stat_process = stat_write + IO_ARRAY_ITEM; + stat_add = stat_process + IO_ARRAY_ITEM; + stat_add_process = stat_add + IO_ARRAY_ITEM; + for (i = 0; i < IO_ARRAY_ITEM; i++) { + stat_proc_comm[i] = (char*)((char *)stat_add_process + IO_ARRAY_ITEM + i * IO_ARRAY_SIZE); + stat_add_proc_comm[i] = (char*)((char *)stat_add_process + IO_ARRAY_ITEM + + (i + IO_ARRAY_ITEM) * IO_ARRAY_SIZE); + } +out: + init_io_scene(); +} + +void exit_scene(void) +{ + exit_io_scene(); + + if (stat_write) + vfree(stat_write); +} + diff --git a/ops/os_stat/os_stat/scene_layer.h b/ops/os_stat/os_stat/scene_layer.h new file mode 100644 index 0000000000000000000000000000000000000000..4b6b1cf35880e0da20c9b8f35ae19404d7ea047c --- /dev/null +++ b/ops/os_stat/os_stat/scene_layer.h @@ -0,0 +1,229 @@ +/* + * find kernel problem + * aurelianliu@tencent.com + */ +#ifndef _SCENE_LAYER_H +#define _SCENE_LAYER_H 
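+/*
+ * Descriptive comment on this header: the scene layer declares the per-index
+ * entry points (scene_before_N()/scene_after_N()) that run before and after a
+ * hooked function and dispatch to the template, IO, MM and NET scene helpers
+ * declared below; init_scene()/exit_scene() set up and tear down the stat
+ * buffers used by those helpers.
+ */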
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TK4_OLD +#include "ext4.h" +#else +#include "ext4.h" +#endif +#include "kprobe_prehook.h" + +#define IO_ARRAY_ITEM 2048 +#define IO_ARRAY_SIZE 16 +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) + +//sysctl_enable_debug +enum SCENE_SELECT { + /* two syctl:printk_struct_first_name, printk_struct_last_name + * such as printk_struct_first_name=fs, printk_struct_last_name=disk + */ + SCENE_ADJUST_POSITION = 1, + SCENE_ADJUST_ENABLE_ONLY = 2, + /* hook variables, uses sysctl:ftrace_hook_one_var */ + SCENE_ADJUST_HOOK_VAR = 3, + /* 1.two syctl:printk_struct_first_name, printk_struct_last_name + * 2.hook variables, uses sysctl:ftrace_hook_one_var + */ + SCENE_ADJUST_MORE_PARAMETER_ONE = 4, + SCENE_ADJUST_MORE_PARAMETER_TWO = 5, + /* enable memory scene stat */ + SCENE_ADJUST_MEM_STAT = 6, +}; + +extern int cur_total_index; +extern int cur_total_add_index; +extern int *stat_write; +extern int *stat_process; +extern char *stat_proc_comm[IO_ARRAY_ITEM]; +extern int *stat_add; +extern int *stat_add_process; +extern char *stat_add_proc_comm[IO_ARRAY_ITEM]; +extern unsigned long sysctl_enable_debug; +extern unsigned int sysctl_module_enable_slub;// enable module by sysctl +extern unsigned int sysctl_module_enable_irq;// enable slub debug by sysctl +extern unsigned long *p_test_var_1; +extern unsigned long *p_test_var_2; +extern unsigned long *p_test_var_3; +extern unsigned long *p_test_var_4; +extern unsigned long *p_test_var_5; +extern unsigned long (*p__test_func1)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long (*p__test_func2)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long (*p__test_func3)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long (*p__test_func4)(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long test_func1(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long test_func2(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long test_func3(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long test_func4(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long test_func5(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8); +extern unsigned long test_func6(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned 
long arg7, unsigned long arg8); +extern unsigned long do_io_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, bool *done, + int name); +extern bool io_scene_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6); +extern void io_scene_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6); +extern bool mm_scene_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6); +extern void mm_scene_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6); +extern bool scene_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6); +extern void scene_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret); +extern unsigned long do_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, bool *done, int name); +extern unsigned long do_scene_template_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, bool *done, int name); +extern void do_scene_template_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test, bool *done, int name); +extern unsigned long do_template_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, bool *done, + int name); +extern bool scene_enabled(void); +extern bool debug_enabled(void); +#ifndef TK2 +int sysctl_irq_enable_handler(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos) +#else + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif +extern int sysctl_slub_enable_handler(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos); +#else + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif +extern unsigned long do_net_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, bool *done, + int name); +#define DEFINE_BEFORE_AFTER_EXTERN(name) \ +extern unsigned long scene_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6); \ +extern void scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long ret, unsigned long test); \ +extern unsigned long io_scene_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6); \ +extern void io_scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long ret, unsigned long test); \ +extern unsigned long net_scene_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ 
+ unsigned long arg5, unsigned long arg6); \ +extern void net_scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long ret, unsigned long test); +#else +static int sysctl_slub_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos){return 0;} +static int sysctl_irq_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos){return 0;} +static unsigned long do_net_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, bool *done, + int name) +{ +} +#define DEFINE_BEFORE_AFTER_EXTERN(name) \ +extern unsigned long scene_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6); \ +extern void scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long ret, unsigned long test); \ +extern unsigned long io_scene_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6); \ +extern void io_scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long ret, unsigned long test); \ +static unsigned long net_scene_before_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long arg6){return 0;} \ +static void net_scene_after_##name(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, \ + unsigned long arg5, unsigned long ret, unsigned long test){return 0;} +#endif +DEFINE_BEFORE_AFTER_EXTERN(1) +DEFINE_BEFORE_AFTER_EXTERN(2) +DEFINE_BEFORE_AFTER_EXTERN(3) +DEFINE_BEFORE_AFTER_EXTERN(4) +DEFINE_BEFORE_AFTER_EXTERN(5) +DEFINE_BEFORE_AFTER_EXTERN(6) +DEFINE_BEFORE_AFTER_EXTERN(7) +DEFINE_BEFORE_AFTER_EXTERN(8) +DEFINE_BEFORE_AFTER_EXTERN(9) +DEFINE_BEFORE_AFTER_EXTERN(10) +DEFINE_BEFORE_AFTER_EXTERN(11) +DEFINE_BEFORE_AFTER_EXTERN(12) +DEFINE_BEFORE_AFTER_EXTERN(13) +DEFINE_BEFORE_AFTER_EXTERN(14) +DEFINE_BEFORE_AFTER_EXTERN(15) +DEFINE_BEFORE_AFTER_EXTERN(16) +DEFINE_BEFORE_AFTER_EXTERN(17) +DEFINE_BEFORE_AFTER_EXTERN(18) +DEFINE_BEFORE_AFTER_EXTERN(19) +DEFINE_BEFORE_AFTER_EXTERN(20) +DEFINE_BEFORE_AFTER_EXTERN(21) +DEFINE_BEFORE_AFTER_EXTERN(22) +DEFINE_BEFORE_AFTER_EXTERN(23) +DEFINE_BEFORE_AFTER_EXTERN(24) +DEFINE_BEFORE_AFTER_EXTERN(25) +DEFINE_BEFORE_AFTER_EXTERN(26) +DEFINE_BEFORE_AFTER_EXTERN(27) +DEFINE_BEFORE_AFTER_EXTERN(28) +DEFINE_BEFORE_AFTER_EXTERN(29) +DEFINE_BEFORE_AFTER_EXTERN(30) +DEFINE_BEFORE_AFTER_EXTERN(31) +DEFINE_BEFORE_AFTER_EXTERN(32) +DEFINE_BEFORE_AFTER_EXTERN(33) +DEFINE_BEFORE_AFTER_EXTERN(34) +DEFINE_BEFORE_AFTER_EXTERN(35) +DEFINE_BEFORE_AFTER_EXTERN(36) +DEFINE_BEFORE_AFTER_EXTERN(37) +DEFINE_BEFORE_AFTER_EXTERN(38) +DEFINE_BEFORE_AFTER_EXTERN(39) +DEFINE_BEFORE_AFTER_EXTERN(40) +DEFINE_BEFORE_AFTER_EXTERN(41) +DEFINE_BEFORE_AFTER_EXTERN(42) +DEFINE_BEFORE_AFTER_EXTERN(43) +DEFINE_BEFORE_AFTER_EXTERN(44) +DEFINE_BEFORE_AFTER_EXTERN(45) +DEFINE_BEFORE_AFTER_EXTERN(46) +DEFINE_BEFORE_AFTER_EXTERN(47) +DEFINE_BEFORE_AFTER_EXTERN(48) +DEFINE_BEFORE_AFTER_EXTERN(49) +DEFINE_BEFORE_AFTER_EXTERN(50) +extern void init_scene(void); +extern void exit_scene(void); +extern void init_io_scene(void); +extern 
void exit_io_scene(void); +extern void store_info(unsigned long arg1, unsigned long arg2, unsigned long arg3, + char *name1, char *name2, char *name3); +extern void match_index(unsigned long arg2, int index); +extern void match_dec_index(unsigned long arg2, bool print, int index); +struct open_flags { + int open_flag; + umode_t mode; + int acc_mode; + int intent; + int lookup_flags; +}; +#endif diff --git a/ops/os_stat/os_stat/scene_template.c b/ops/os_stat/os_stat/scene_template.c new file mode 100644 index 0000000000000000000000000000000000000000..fe451e857d8e65f25dc47b7fa5bb2ce468c31baa --- /dev/null +++ b/ops/os_stat/os_stat/scene_template.c @@ -0,0 +1,129 @@ +/* + * find kernel problem + * aurelianliu@tencent.com + */ + +#include "scene_layer.h" +#include "data_aware.h" + +static char functions[][32] = { + "ep_autoremove_wake_function", //change to the function you want to debug + "bfq_dispatch_request", //change to the function you want to debug + "ata_qc_complete_internal", //...... + "function_namen", //change to the function you want to debug +}; + +/* + * To trace some functions, such as vfs_read, __alloc_pages_nodemask, etc. + * You can trace more functions (now < 50); the steps are as follows: + * Step 1, add debug code. If you only need to see function parameters and the return value, + * see do_scene_template_before(); + * see do_scene_template_after(); + * if you need to copy a whole function body here, see do_template_scene(). + * Step 2, build the module, e.g.: make -C /usr/src/kernels/5.4.241-1-tlinux4-0017.10/ + M=`pwd` modules version=5.4 + * Step 3, sign and insmod the ko + * Step 4, $function can be any one of the functions above, in any order: + * echo $function > ftrace_hook_one_function + * echo $function > ftrace_hook_one_function + * echo $function > ftrace_hook_one_function + * echo $function > ftrace_hook_function + * + * Step 5, read the debug information + * + * Other: once added to the interface layer, you can also use: + * t-ops os_stat -f f1 ... .etc + * or + * t-ops interaction ... 
.etc +*/ +/* see front part of function, could see parameters */ +unsigned long do_scene_template_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, bool *done, int name) +{ + unsigned long ret; + + /* + //struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + // int preferred_nid, nodemask_t *nodemask) + if (strstr(get_one_func_name(name), functions[3])) { + unsigned int order = arg2; // the 2cd parameter of __alloc_pages_nodemask + // is order, then order = arg2; + // get other parameter likes this; + /add more debug info/ + } + */ + *done = false; + if(!debug_enabled()) + return ret; + + if (strstr(ftrace_hook_name, functions[0])) { + *done = true; + dump_stack(); + } + + if (strstr(get_one_func_name(name), functions[2])) { + *done = true; + /*add more debug info*/ + } + if (strstr(get_one_func_name(name), functions[1])) { + *done = true; + /*add more debug info*/ + } + return ret; +} + +/* see back part of function, could see parameters and return value */ +void do_scene_template_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test, bool *done, int name) +{ + if(!debug_enabled()) + return; + + if (strstr(get_one_func_name(name), functions[1])) { + /*add more debug info*/ + } + if (strstr(get_one_func_name(name), functions[2])) { + /*add more debug info*/ + } +} + +/* Method1: see whole body of function, could see any lines of the function */ +/* need copy whole code of function here */ +/* Method2: only modify parameter, can call __test_func##name */ +unsigned long do_template_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, bool *done, int name) +{ + unsigned long ret = 0; + + *done = false; + if(!debug_enabled()) + return ret; + /* Method 1: copy function body form kernel */ + /* step 1: copy function code here from linux kernel */ + /* step 2: *done = true */ + + /* Method 2: only modify parameter, not copy function + * to get function body function:get_one_func_body(functions[])*/ + /* which is origin function body */ + /* this can be used to modify parameters only */ + /*if (strstr(get_one_func_name(0), functions[0])) { + func_body_type func; + func = get_one_func_body(functions[]); + ret = (*func)(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); + *done = true; + return ret; + }*/ + + /* For method 1: if function body has sub functions, + * such as sub function is functions[1], + * and functions[1] can be hooked by ftrace, + * we can call functions[1] in function body like this, + */ + /*if (strstr(get_one_func_name(1), functions[1])) { + func_type func; + func = get_one_func(functions[]); + func(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); + *done = false; + }*/ + return ret; +} diff --git a/ops/os_stat/os_stat/scene_template_example.c b/ops/os_stat/os_stat/scene_template_example.c new file mode 100644 index 0000000000000000000000000000000000000000..86b0acd4f82f661641d037dc35ac199fd12ce45c --- /dev/null +++ b/ops/os_stat/os_stat/scene_template_example.c @@ -0,0 +1,542 @@ +/* + * find kernel problem + * aurelianliu@tencent.com + */ +#include +#include +#include +#include "scene_layer.h" +#include "data_aware.h" + +/* + * I. to hook tcp_sendmsg and copy its whole code: + * 1.copy its code here, change its name into "stat_tcp_recvmsg", see below. 
+ * 2.in do_template_scene: + * (1) call stat_tcp_recvmsg(); + * (2) *done = true; + * 3.subfunction of tcp_sendmsg(order is casual) + * a.sub functiion which can be hooked, see step 5: + * (1) add tcp_recv_timestamp, tcp_rcv_space_adjust + * tcp_cleanup_rbuf,inet_recv_error to funcions[] any order + * just see below code. + * b.if not be hooked, like these: + * tcp_recv_urg tcp_peek_sndq tcp_inq_hint .etc + * copy there code here, see below. + * 4.add header files, such as , + * when make module, some struct error, then add its header files + * 5.modify functions[], which are hooked. + * 6.make module, sign, and install ko + * II. if only moidify function's parameter, + * 1. add function name into functions[] + * 2. uses do_template_scene method 2 is ok + * 3. make module, sign, and install ko + * 4. echo $function > /proc/sys/os_aware/ftrace_hook_function + * III. only see parameter or return value + * 1.uses do_scene_template_before to see parameter + * 2.uses do_scene_template_after to see parameter or return value + * 3.no need to uses do_template_scene. + * 4.modify functions[], add function name + * 5.add print info in do_scene_template_before or do_scene_template_after + * + * if need, code above can be added to t-ops interaction and command interface + * the uses t-ops ... + */ + +static char functions[][32] = { + "tcp_recvmsg", //change your function which to debug + "tcp_recv_timestamp", //change your function which to debug + "tcp_rcv_space_adjust", //change your function which to debug + "tcp_cleanup_rbuf", //change your function which to debug + "inet_recv_error", //change your function which to debug +}; + +/* + * To trace some functions, such as vfs_read, __alloc_pages_nodemask .etc + * U can trace more functions(now < 50), need two steps: + * Step 1, add debug code, if only see function parameters and return value, + * see do_scene_template_before(); + * see do_scene_template_after(); + * if need copy whole functions to here, see do_template_scene + * Step 2, make modules, likes: make -C /usr/src/kernels/5.4.241-1-tlinux4-0017.10/ + M=`pwd` modules version=5.4 + * Step 3, sign and insmod ko + * Step 4, function can be any one of above functions, no matter order or sequence. + * echo $function > ftrace_hook_one_function + * echo $function > ftrace_hook_one_function + * echo $function > ftrace_hook_one_function + * echo $function > ftrace_hook_function + * + * Step 5, see debug infomation + * + * Other: when added to interface layer, you can also use: + * t-ops os_stat -f f1 ... .etc + * or + * t-ops interaction ... 
.etc +*/ +/* see front part of function, could see parameters */ +unsigned long do_scene_template_before(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, bool *done, int name) +{ + unsigned long ret; + + if (!scene_enabled()) { + *done = false; + return 0; + } + /* + //struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + // int preferred_nid, nodemask_t *nodemask) + if (strstr(get_one_func_name(name), functions[3])) { + unsigned int order = arg2; // the 2cd parameter of __alloc_pages_nodemask + // is order, then order = arg2; + // get other parameter likes this; + /add more debug info/ + } + */ + if (strstr(get_one_func_name(name), functions[0])) { + /*add more debug info*/ + } + if (strstr(get_one_func_name(name), functions[1])) { + /*add more debug info*/ + } + return ret; +} + +/* see back part of function, could see parameters and return value */ +void do_scene_template_after(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long ret, unsigned long test, bool *done, int name) +{ + if (strstr(get_one_func_name(name), functions[0])) { + /*add more debug info*/ + } + if (strstr(get_one_func_name(name), functions[2])) { + /*add more debug info*/ + } +} +static void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec64(skb->tstamp); + else + tss->ts[0] = (struct timespec64) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec64) {0}; +} +static int tcp_inq_hint(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 copied_seq = READ_ONCE(tp->copied_seq); + u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); + int inq; + + inq = rcv_nxt - copied_seq; + if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { + lock_sock(sk); + inq = tp->rcv_nxt - tp->copied_seq; + release_sock(sk); + } + /* After receiving a FIN, tell the user-space to continue reading + * by returning a non-zero inq. + */ + if (inq == 0 && sock_flag(sk, SOCK_DONE)) + inq = 1; + return inq; +} +static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) +{ + struct sk_buff *skb; + int copied = 0, err = 0; + + /* XXX -- need to support SO_PEEK_OFF */ + + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } + + skb_queue_walk(&sk->sk_write_queue, skb) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + break; + + copied += skb->len; + } + + return err ?: copied; +} +static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* No URG data to read. */ + if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || + tp->urg_data == TCP_URG_READ) + return -EINVAL; /* Yes this is right ! */ + + if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) + return -ENOTCONN; + + if (tp->urg_data & TCP_URG_VALID) { + int err = 0; + char c = tp->urg_data; + + if (!(flags & MSG_PEEK)) + tp->urg_data = TCP_URG_READ; + + /* Read urgent data. */ + msg->msg_flags |= MSG_OOB; + + if (len > 0) { + if (!(flags & MSG_TRUNC)) + err = memcpy_to_msg(msg, &c, 1); + len = 1; + } else + msg->msg_flags |= MSG_TRUNC; + + return err ? 
-EFAULT : len; + } + + if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) + return 0; + + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike + */ + return -EAGAIN; +} +int stat_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int copied = 0; + u32 peek_seq; + u32 *seq; + unsigned long used; + int err, inq; + int target; /* Read at least this many bytes */ + long timeo; + struct sk_buff *skb, *last; + u32 urg_hole = 0; + struct scm_timestamping_internal tss; + int cmsg_flags; + + if (unlikely(flags & MSG_ERRQUEUE)) { + func_type func4; + func4 = get_one_func(functions[4]); + return func4(sk, msg, len, addr_len, 0, 0, 0, 0); + //return inet_recv_error(sk, msg, len, addr_len); + } + + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && + (sk->sk_state == TCP_ESTABLISHED)) + sk_busy_loop(sk, nonblock); + + lock_sock(sk); + + err = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + cmsg_flags = tp->recvmsg_inq ? 1 : 0; + timeo = sock_rcvtimeo(sk, nonblock); + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + goto recv_urg; + + if (unlikely(tp->repair)) { + err = -EPERM; + if (!(flags & MSG_PEEK)) + goto out; + + if (tp->repair_queue == TCP_SEND_QUEUE) + goto recv_sndq; + + err = -EINVAL; + if (tp->repair_queue == TCP_NO_QUEUE) + goto out; + + /* 'common' recv queue MSG_PEEK-ing */ + } + + seq = &tp->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; + seq = &peek_seq; + } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + do { + u32 offset; + + /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ + if (tp->urg_data && tp->urg_seq == *seq) { + if (copied) + break; + if (signal_pending(current)) { + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + } + + /* Next get a buffer. */ + + last = skb_peek_tail(&sk->sk_receive_queue); + skb_queue_walk(&sk->sk_receive_queue, skb) { + last = skb; + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), + "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, + flags)) + break; + + offset = *seq - TCP_SKB_CB(skb)->seq; + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); + offset--; + } + if (offset < skb->len) + goto found_ok_skb; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto found_fin_ok; + WARN(!(flags & MSG_PEEK), + "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); + } + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + /* This occurs when user tries to read + * from never connected socket. 
+ */ + copied = -ENOTCONN; + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + func_type func3; + func3 = get_one_func(functions[3]); + func3(sk, copied, 0, 0, 0, 0, 0, 0); + // tcp_cleanup_rbuf(sk, copied); + + if (copied >= target) { + /* Do not sleep, just process backlog. */ + release_sock(sk); + lock_sock(sk); + } else { + sk_wait_data(sk, &timeo, last); + } + + if ((flags & MSG_PEEK) && + (peek_seq - copied - urg_hole != tp->copied_seq)) { + net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", + current->comm, + task_pid_nr(current)); + peek_seq = tp->copied_seq; + } + continue; + +found_ok_skb: + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + /* Do we have urgent data here? */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; + if (urg_offset < used) { + if (!urg_offset) { + if (!sock_flag(sk, SOCK_URGINLINE)) { + WRITE_ONCE(*seq, *seq + 1); + urg_hole++; + offset++; + used--; + if (!used) + goto skip_copy; + } + } else + used = urg_offset; + } + } + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_msg(skb, offset, msg, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } + + WRITE_ONCE(*seq, *seq + used); + copied += used; + len -= used; + + func_type func2; + func2 = get_one_func(functions[2]); + func2(sk, 0, 0, 0, 0, 0, 0, 0); + //tcp_rcv_space_adjust(sk); + +skip_copy: + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { + tp->urg_data = 0; + tcp_fast_path_check(sk); + } + + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, &tss); + cmsg_flags |= 2; + } + + if (used + offset < skb->len) + continue; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + continue; + +found_fin_ok: + /* Process the FIN. */ + WRITE_ONCE(*seq, *seq + 1); + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + break; + } while (len > 0); + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ + + /* Clean up data we have read: This will do ACK frames. 
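+	 *
+	 * Note on the pattern used here and earlier in this copied body: the original
+	 * direct calls are left commented out (e.g. tcp_cleanup_rbuf(sk, copied)) and
+	 * replaced by indirect calls through get_one_func(functions[i]), so the copied
+	 * code reaches kernel-internal helpers via the module's hook layer instead of
+	 * calling those symbols directly.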
*/ + func_type func3; + func3 = get_one_func(functions[3]); + func3(sk, copied, 0, 0, 0, 0, 0, 0); + //tcp_cleanup_rbuf(sk, copied); + + release_sock(sk); + + if (cmsg_flags) { + if (cmsg_flags & 2) { + func_type func1; + func1 = get_one_func(functions[1]); + func1(msg, sk, &tss, 0, 0, 0, 0, 0); + } + //tcp_recv_timestamp(msg, sk, &tss); + if (cmsg_flags & 1) { + inq = tcp_inq_hint(sk); + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + } + } + + return copied; + +out: + release_sock(sk); + return err; + +recv_urg: + err = tcp_recv_urg(sk, msg, len, flags); + goto out; + +recv_sndq: + err = tcp_peek_sndq(sk, msg, len); + goto out; +} +/* Method1: see whole body of function, could see any lines of the function */ +/* need copy whole code of function here */ +/* Method2: only modify parameter, can call __tesst_func##name */ +unsigned long do_template_scene(unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, unsigned long arg7, unsigned long arg8, bool *done, int name) +{ + unsigned long ret = 0; + + *done = false; + if (!scene_enabled()) + return 0; + + /* Method 1: copy function body form kernel */ + /* step 1: copy function code here from linux kernel */ + /* step 2: *done = true */ + /*if (strstr(get_one_func_name(name), functions[0])) { + *done = true; + }*/ + + if (strstr(get_one_func_name(name), functions[0])) { + if (!get_one_func(functions[4]) || !get_one_func(functions[3]) + || !get_one_func(functions[2]) || !get_one_func(functions[1])) + return 0; + + ret = stat_tcp_recvmsg(arg1, arg2, arg3, arg4, arg5, arg6); + *done = true; + } + + /* Method 2: only modify parameter, not copy function + * to get function body function:get_one_func_body(functions[])*/ + /* which is origin function body */ + /* this can be used to modify parameters only */ + /*if (strstr(get_one_func_name(0), functions[0])) { + func_body_type func; + func = get_one_func_body(functions[]); + ret = (*func)(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); + *done = true; + return ret; + }*/ + + /* For method 1: if function body has sub functions, + * such as sub function is functions[1], + * and functions[1] can be hooked by ftrace, + * we can call functions[1] in function body like this, + */ + /*if (strstr(get_one_func_name(1), functions[1])) { + func_type func; + func = get_one_func(functions[]); + func(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); + }*/ + return ret; +} diff --git a/ops/os_stat/os_stat/store_data.c b/ops/os_stat/os_stat/store_data.c new file mode 100644 index 0000000000000000000000000000000000000000..1d79afd9b7270fadbc1f98c3d6d151e6380ef38e --- /dev/null +++ b/ops/os_stat/os_stat/store_data.c @@ -0,0 +1,198 @@ +/* + * find kernel problem + * aurelianliu@tencent.com + */ +#include "data_aware.h" +#include "scene_layer.h" + +static int print_total_index; +void store_info(unsigned long arg1, unsigned long arg2, unsigned long arg3, char *name1, char*name2, char *name3) +{ + int index; + char *proc; + + if (message_info_stat) { + index = message_end++ % message_total_index; + message_info_stat[index].nr = arg1; + message_info_stat[index].num = arg2; + message_info_stat[index].latency = arg3; + + proc = message_info_stat[index].func; + strncpy(message_info_stat[index].func, name1, NAME_MAX >> 1); + if (name2 && name3) { + proc += (NAME_MAX >> 1); + sprintf(proc, "%s---%s", name2, name3); + } + } +} + +void get_info(int len, +#ifdef TK5 + void *buffer) +#else + void __user *buffer) +#endif +{ + int tmp_len, len_1 = 0, len_2, 
i; + int start = message_start % message_total_index, end = message_end % message_total_index; + + len_1 = (end - start); + if (end < start) + len_1 = message_total_index - start; + len_2 = len; + if (len_1 == 0) + return; + + for (i = 0; i < 2; i++) { + len_1 *= sizeof(struct func_latency); + len_1 = len_2 < len_1? len_2:len_1; +#ifdef TK5 + memcpy(buffer, &message_info_stat[start], len_1); +#else + copy_to_user(buffer, &message_info_stat[start], len_1); +#endif + message_start += 1; + if (message_start == message_end) + break; + buffer += len_1; + len_2 = len - len_1; + if (end < start) + len_1 = end; + start = 0; + } +} + +int sysctl_get_kernel_info(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos) +#else + void __user *buffer, size_t *lenp, loff_t *ppos) +#endif +{ + size_t len = message_total_index * sizeof(struct func_latency); + + len = len < *lenp? len:*lenp; + get_info(len, buffer); + + return 0; +} + +void match_index(unsigned long arg2, int index) +{ + int i; + char buf[32]; + for (i = 0; i < IO_ARRAY_ITEM; i++) { + if (stat_add_process[i] == current->pid || stat_add_process[i] == 0) + break; + } + if (i >= IO_ARRAY_ITEM) + goto out; + + if (cur_total_add_index < i) + cur_total_add_index = i; + + if (sysctl_enable_debug) + stat_add[i] += arg2; + else + stat_add[i]++; + if (stat_add_process[i] == 0) { + stat_add_process[i] = current->pid; + strncpy(stat_add_proc_comm[i], current->comm, 16); + } + + if (stat_add[i] != 0 && stat_add[i] % 100 == 0) { + sprintf(buf, "total add:%d:", index); + store_info(current->pid, stat_add[i], stat_add_process[i], current->comm, buf, + "process access stat"); + } +out: + + return; +} + +void match_dec_index(unsigned long arg2, bool print, int index) +{ + int i; + char buf[32]; + + for (i = 0; i < IO_ARRAY_ITEM; i++) { + if (stat_process[i] == current->pid || stat_process[i] == 0) + break; + } + if (i >= IO_ARRAY_ITEM) + goto next; + + pr_info("----%s, %d", __func__, __LINE__); + + if (sysctl_enable_debug) + stat_write[i] += arg2; + else + stat_write[i]++; + if (stat_process[i] == 0) { + stat_process[i] = current->pid; + strncpy(stat_proc_comm[i], current->comm, 16); + } + + if (cur_total_index < i) + cur_total_index = i; + + if (!print && stat_write[i] != 0 && stat_write[i] % 100 == 0) { + sprintf(buf, "total dec:%d:", index); + store_info(current->pid, stat_write[i], stat_process[i], current->comm, buf, + "process access stat"); + } else if (print && stat_write[i] != 0 && stat_write[i] % 100 == 0) + pr_info("%s, proc:%16s, %8d, write:%d, %d, %d", + ftrace_hook_name, current->comm, current->pid, + stat_write[i], stat_process[i], i); +next: + + return; +} +int sysctl_get_kernel_data(struct ctl_table *table, int write, +#ifdef TK5 + void *buffer, size_t *lenp, loff_t *ppos) +#else + void __user *buffer, size_t *lenp, loff_t *ppos) +#endif +{ + int ret = 0, index; + int total_dec, total_add, total; + struct func_latency data; + size_t len = sizeof(struct func_latency); + char *proc; + + if (!stat_write) + return 0; + + len = len < *lenp? len:*lenp; + + total_add = cur_total_add_index >= IO_ARRAY_ITEM ? IO_ARRAY_ITEM:cur_total_add_index; + total_dec = cur_total_index >= IO_ARRAY_ITEM ? 
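+	/* (added note) this handler returns one struct func_latency per read:
+	 * print_total_index cycles through the accumulated per-process "dec"
+	 * entries first and then the "add" entries, see the branches below. */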
IO_ARRAY_ITEM:cur_total_index; + total = total_add + total_dec; + if (total == 0) + total = 1; + + index = print_total_index++ % total; + + if (index < total_dec) { + data.nr = stat_process[index]; + data.num = stat_write[index]; + strcpy(data.func, stat_proc_comm[index]); + strcat(data.func, "---dec"); + } else { + index -= total_dec; + if (index < 0) + index = 0; + data.nr = stat_add_process[index]; + data.num = stat_add[index]; + strcpy(data.func, stat_add_proc_comm[index]); + strcat(data.func, "---add"); + } +#ifdef TK5 + memcpy(buffer, &data, len); +#else + copy_to_user(buffer, &data, len); +#endif + + return ret; +} diff --git a/ops/os_stat/os_stat/syms.c b/ops/os_stat/os_stat/syms.c new file mode 100644 index 0000000000000000000000000000000000000000..09233830ef2ebf75d1e4de31e8b6c51fde8863ed --- /dev/null +++ b/ops/os_stat/os_stat/syms.c @@ -0,0 +1,203 @@ +/* + * Handle kernel unexport symbols + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#ifdef TK5 +#include +#endif +#include +#include "syms.h" + +/* + * Unexport variables + */ +/* + * Unexport symbols + */ +struct rq __percpu *p_runqueues; +struct list_head *p_slab_caches; +unsigned long *p_totalreserve_pages; +struct wb_domain *p_global_wb_domain; +unsigned int *p_bdi_min_ratio; +#ifdef TK5 +sys_call_ptr_t *p_sys_call_table[NR_syscalls+1]; +struct mutex *p_module_mutex; +#else +#ifdef CONFIG_X86_64 +sys_call_ptr_t *p_sys_call_table[__NR_syscall_max+1]; +#elif defined CONFIG_ARM64 +syscall_fn_t *p_sys_call_table[__NR_syscalls]; +#endif +#endif +/* + * Scheduler hooks + */ +u64 (*p_stat_arch_irq_stat_cpu)(unsigned int cpu); +u64 (*p_stat_arch_irq_stat)(void); +void (*p_stat_do_syscall_64)(unsigned long nr, struct pt_regs *regs); +int (*p_idle_cpu)(int cpu); +void (*p_stat_sched_rqm_switch)(struct rq *rq, struct task_struct *prev, struct task_struct *next); +#ifdef CONFIG_X86_64 +#ifdef TK5 +void (*p_stat_handle_level_irq)(struct irq_desc *desc); +void (*p_stat_handle_fasteoi_irq)(struct irq_desc *desc); +void (*p_stat_handle_edge_irq)(struct irq_desc *desc); +void (*p_stat_handle_simple_irq)(struct irq_desc *desc); +#else +unsigned int (*p_stat_do_IRQ)(struct pt_regs *regs); +#endif +#endif +#ifdef CONFIG_ARM64 +void (*p_stat_gic_handle_irq)(struct pt_regs *regs); +#endif +void (*p_stat_psi_task_switch)(struct task_struct *prev, struct task_struct *next, bool sleep); +struct rq *(*p_stat_finish_task_switch)(struct task_struct *prev); +struct task_struct * (*p_stat_pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags_stat *rf); +struct page * +(*p__alloc_pages_nodemask)(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask); +blk_qc_t (*p_submit_bio)(struct bio *bio); +void (*p_bio_endio)(struct bio *bio); +void *(*p__kmalloc)(size_t size, gfp_t flags); +void *(*p__kmalloc_node)(size_t size, gfp_t flags, int node); +void *(*p_kmem_cache_alloc)(struct kmem_cache *s, gfp_t gfpflags); +void (*p_do_kern_addr_fault)(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address); +void (*p_no_context)(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code); +void (*p_do_general_protection)(struct pt_regs *regs, long error_code); +void (*p_do_divide_error)(struct pt_regs *regs, long error_code); +#ifndef TK5 +bool (*p_blk_mq_get_driver_tag)(struct request *rq); +#else +bool (*p_blk_mq_dispatch_rq_list)(struct blk_mq_hw_ctx *hctx, struct list_head *list, + unsigned int nr_budgets); +#endif +void (*p_do_page_fault)(struct pt_regs *regs, 
unsigned long error_code, unsigned long address); +void (*p_stat_rcu_note_context_switch)(bool preempt); +int (*p_stat_access_remote_vm)(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); +#ifdef TK5 +vm_fault_t (*p_handle_mm_fault)(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs); +#endif + +#ifdef TK2 +void (*p__sched_fork)(struct task_struct *p); +void (*p_do_exit)(long code); +#endif + +struct ksym +{ + int type; + char *name; + void **address; +}; + +enum ksym_type { + KSYM_DEF, + KSYM_JMP_MCOUNT +}; +static struct ksym syms[] = { +#ifdef TK5 + {KSYM_DEF, "module_mutex", (void **)&p_module_mutex}, + {KSYM_JMP_MCOUNT, "__alloc_pages", (void **)&p__alloc_pages_nodemask}, + {KSYM_JMP_MCOUNT, "psi_task_switch", (void **)&p_stat_psi_task_switch}, + {KSYM_JMP_MCOUNT, "handle_mm_fault", (void **)&p_handle_mm_fault}, + {KSYM_JMP_MCOUNT, "blk_mq_dispatch_rq_list", (void **)&p_blk_mq_dispatch_rq_list}, +#else +#ifdef CONFIG_X86_64 + {KSYM_JMP_MCOUNT, "do_syscall_64", (void **)&p_stat_do_syscall_64}, + {KSYM_JMP_MCOUNT, "arch_irq_stat_cpu", (void **)&p_stat_arch_irq_stat_cpu}, + {KSYM_JMP_MCOUNT, "arch_irq_stat", (void **)&p_stat_arch_irq_stat}, +#endif + {KSYM_JMP_MCOUNT, "__alloc_pages_nodemask", (void **)&p__alloc_pages_nodemask}, + {KSYM_JMP_MCOUNT, "finish_task_switch", (void **)&p_stat_finish_task_switch}, + {KSYM_JMP_MCOUNT, "blk_mq_get_driver_tag", (void **)&p_blk_mq_get_driver_tag}, +#ifndef TK3 + {KSYM_JMP_MCOUNT, "do_page_fault", (void **)&p_do_page_fault}, +#endif +#ifdef TK4_NEW + {KSYM_JMP_MCOUNT, "sched_rqm_switch", (void **)&p_stat_sched_rqm_switch}, +#else + {KSYM_JMP_MCOUNT, "rcu_note_context_switch", (void **)&p_stat_rcu_note_context_switch}, +#endif +#endif + {KSYM_DEF, "runqueues", (void **)&p_runqueues}, + {KSYM_DEF, "slab_caches", (void **)&p_slab_caches}, + {KSYM_DEF, "global_wb_domain", (void **)&p_global_wb_domain}, + {KSYM_DEF, "bdi_min_ratio", (void **)&p_bdi_min_ratio}, + {KSYM_DEF, "totalreserve_pages", (void **)&p_totalreserve_pages}, + {KSYM_DEF, "sys_call_table", (void **)&p_sys_call_table}, + {KSYM_JMP_MCOUNT, "idle_cpu", (void **)&p_idle_cpu}, +#if defined(TK4_OLD) || defined(TK3) || defined(TK2) || CONFIG_ARM64 + {KSYM_JMP_MCOUNT, "pick_next_task_fair", (void **)&p_stat_pick_next_task}, +#else + {KSYM_JMP_MCOUNT, "pick_next_task", (void **)&p_stat_pick_next_task}, +#endif + {KSYM_JMP_MCOUNT, "submit_bio", (void **)&p_submit_bio}, + {KSYM_JMP_MCOUNT, "bio_endio", (void **)&p_bio_endio}, +#ifdef CONFIG_X86_64 + {KSYM_JMP_MCOUNT, "arch_irq_stat_cpu", (void **)&p_stat_arch_irq_stat_cpu}, + {KSYM_JMP_MCOUNT, "arch_irq_stat", (void **)&p_stat_arch_irq_stat}, +#ifdef TK5 + {KSYM_JMP_MCOUNT, "handle_level_irq", (void **)&p_stat_handle_level_irq}, + {KSYM_JMP_MCOUNT, "handle_fasteoi_irq", (void **)&p_stat_handle_fasteoi_irq}, + {KSYM_JMP_MCOUNT, "handle_edge_irq", (void **)&p_stat_handle_edge_irq}, + {KSYM_JMP_MCOUNT, "handle_simple_irq", (void **)&p_stat_handle_simple_irq}, +#else + {KSYM_JMP_MCOUNT, "do_IRQ", (void **)&p_stat_do_IRQ}, +#endif +#endif +#ifdef CONFIG_ARM64 + {KSYM_JMP_MCOUNT, "gic_handle_irq", (void **)&p_stat_gic_handle_irq}, +#endif +#ifdef TK2 + {KSYM_JMP_MCOUNT, "__sched_fork", (void **)&p__sched_fork}, + {KSYM_JMP_MCOUNT, "do_exit", (void **)&p_do_exit}, +#endif + {KSYM_JMP_MCOUNT, "access_remote_vm", (void **)&p_stat_access_remote_vm}, + {KSYM_JMP_MCOUNT, "__kmalloc", (void **)&p__kmalloc}, + {KSYM_JMP_MCOUNT, "__kmalloc_node", (void **)&p__kmalloc_node}, + 
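+	/* Note (assumption about intent): ksyms_init() below resolves KSYM_JMP_MCOUNT
+	 * entries to addr + MCOUNT_INSN_SIZE, i.e. the stored pointer skips the
+	 * mcount/fentry patch site, so calls made through it should not re-enter an
+	 * ftrace hook installed on the same function. */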
{KSYM_JMP_MCOUNT, "kmem_cache_alloc", (void **)&p_kmem_cache_alloc}, + {KSYM_JMP_MCOUNT, "do_kern_addr_fault", (void **)&p_do_kern_addr_fault}, + {KSYM_JMP_MCOUNT, "no_context", (void **)&p_no_context}, + {KSYM_JMP_MCOUNT, "do_general_protection", (void **)&p_do_general_protection}, + {KSYM_JMP_MCOUNT, "do_divide_error", (void **)&p_do_divide_error}, + {0, 0, 0} +}; + +int ksyms_init(void) +{ + unsigned long addr; + int i = 0; + + /* Init kernel symbols */ + while (true) { + if (!syms[i].name) + break; + +#ifdef TK5 + addr = kallsyms_lookup_name_tk5(syms[i].name); +#else + addr = kallsyms_lookup_name(syms[i].name); +#endif + if (!addr) { + pr_err("symbol %s not found\n", syms[i].name); + return -ENODEV; + } + + switch (syms[i].type) { + case KSYM_DEF: + syms[i++].address[0] = (void *)addr; + break; + case KSYM_JMP_MCOUNT: + syms[i++].address[0] = (void *)(addr + MCOUNT_INSN_SIZE); + break; + default: + pr_err("symbol %s invalid type %d\n", syms[i].name, syms[i].type); + return -EINVAL; + } + } + return 0; +} diff --git a/ops/os_stat/os_stat/syms.h b/ops/os_stat/os_stat/syms.h new file mode 100644 index 0000000000000000000000000000000000000000..2d43554bea93af06017a1a37b757b8dbdfdf27f9 --- /dev/null +++ b/ops/os_stat/os_stat/syms.h @@ -0,0 +1,186 @@ +/* + * Handle kernel unexport symbols + * + * For unexported functions, we can find the address through + * kallsyms and assign it to the relevant function pointer. + * For unexported variables, we can find the address through + * kallsyms and copy it to the relevant variable pointer, or + * copy the variable value. For some immutable variables, it + * is generally not a big problem to directly copy the + * variable value. + * + * TODO: some variables value may change. + * + * brookxu@tencent.com 20220330 + */ +#ifndef _SYMS_H +#define _SYMS_H + +#include +#include +#include +#include +#include +#include +#include +#include "version.h" +#ifndef TK2 +#include +#endif +#include "hook_tk5.h" + +#ifdef CONFIG_X86_64 +#include "include/arch/x86/include/asm/syscall.h" +#elif defined CONFIG_ARM64 +typedef long (*syscall_fn_t)(const struct pt_regs *regs); +#endif +#ifdef TK2 +struct pin_cookie { unsigned int val; }; +#define blk_qc_t void +#endif +struct rq_flags_stat { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). + */ + unsigned int clock_update_flags; +#endif +}; +/* + * This is mainly to avoid the runqueues undefined compilation + * problem, We need to modify the implementation of macros such + * as cpu_rq, and the modification results are as follows: + * #define cpu_rq(cpu) (per_cpu_ptr(runqueues, (cpu))) + * #define this_rq() this_cpu_ptr(runqueues) + * #define raw_rq() raw_cpu_ptr(runqueues) + * + * Tips: runqueues is changed from a percpu object to a percpu + * pointer. 
+ */ +#ifdef TK2 +typedef void (*sys_call_ptr_t)(void); +#endif + +#ifdef TK5 +extern sys_call_ptr_t *p_sys_call_table[NR_syscalls+1]; +extern struct mutex *p_module_mutex; +#define module_mutex_tk5 p_module_mutex +#else +#ifdef CONFIG_X86_64 +extern sys_call_ptr_t *p_sys_call_table[__NR_syscall_max+1]; +#elif defined CONFIG_ARM64 +extern syscall_fn_t *p_sys_call_table[__NR_syscalls]; +#endif +#endif + +#define stat_sys_call_table (*p_sys_call_table) +extern struct rq __percpu *p_runqueues; +#define runqueues (p_runqueues) +extern struct list_head *p_slab_caches; +#define _slab_caches (p_slab_caches) +extern struct wb_domain *p_global_wb_domain; +#define global_wb_domain (*p_global_wb_domain) +extern unsigned long *p_totalreserve_pages; +#define totalreserve_pages (*p_totalreserve_pages) +extern unsigned int *p_bdi_min_ratio; +#define bdi_min_ratio (*p_bdi_min_ratio) + +#ifdef CONFIG_X86_64 +extern void (*p_stat_do_syscall_64)(unsigned long nr, struct pt_regs *regs); +#define do_syscall_64(nr, regs) p_stat_do_syscall_64(nr, regs) +extern u64 (*p_stat_arch_irq_stat_cpu)(unsigned int cpu); +#define stat_arch_irq_stat_cpu(cpu) p_stat_arch_irq_stat_cpu(cpu) +#endif + +#ifdef CONFIG_X86_64 +#ifdef TK5 +extern void (*p_stat_handle_level_irq)(struct irq_desc *desc); +#define __handle_level_irq(desc) p_stat_handle_level_irq(desc) +extern void (*p_stat_handle_fasteoi_irq)(struct irq_desc *desc); +#define __handle_fasteoi_irq(desc) p_stat_handle_fasteoi_irq(desc) +extern void (*p_stat_handle_edge_irq)(struct irq_desc *desc); +#define __handle_edge_irq(desc) p_stat_handle_edge_irq(desc) +extern void (*p_stat_handle_simple_irq)(struct irq_desc *desc); +#define __handle_simple_irq(desc) p_stat_handle_simple_irq(desc) +#else +extern unsigned int (*p_stat_do_IRQ)(struct pt_regs *regs); +#define do_IRQ(regs) p_stat_do_IRQ(regs) +#endif +#endif +#ifdef CONFIG_ARM64 +extern void (*p_stat_gic_handle_irq)(struct pt_regs *regs); +#define gic_handle_irq(regs) p_stat_gic_handle_irq(regs) +#endif +extern void (*p_stat_sched_rqm_switch)(struct rq *rq, struct task_struct *prev, struct task_struct *next); +#define sched_rqm_switch(rq, prev, next) p_stat_sched_rqm_switch(rq, prev, next) + +extern struct rq *(*p_stat_finish_task_switch)(struct task_struct *prev); +#define finish_task_switch(prev) p_stat_finish_task_switch(prev) +extern int (*p_idle_cpu)(int cpu); +#define idle_cpu(cpu) p_idle_cpu(cpu) +extern struct page * +(*p__alloc_pages_nodemask)(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask); +#define __alloc_pages_nodemask(gfp_mask, order, preferred_nid, nodemask) p__alloc_pages_nodemask(gfp_mask, order, preferred_nid, nodemask) +extern u64 (*p_stat_arch_irq_stat)(void); +#define stat_arch_irq_stat() p_stat_arch_irq_stat() +extern blk_qc_t (*p_submit_bio)(struct bio *bio); +#define test_submit_bio(bio) p_submit_bio(bio) +extern void (*p_bio_endio)(struct bio *bio); +#define test_bio_endio(bio) p_bio_endio(bio) +extern void *(*p__kmalloc)(size_t size, gfp_t flags); +#define test__kmalloc(size, flags) p__kmalloc(size, flags) +extern void *(*p__kmalloc_node)(size_t size, gfp_t flags, int node); +#define test__kmalloc_node(size, flags, node) p__kmalloc_node(size, flags, node) +extern void *(*p_kmem_cache_alloc)(struct kmem_cache *s, gfp_t gfpflags); +#define test_kmem_cache_alloc(s, gfpflags) p_kmem_cache_alloc(s, gfpflags) +extern void (*p_do_kern_addr_fault)(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address); +#define test_do_kern_addr_fault(regs, 
hw_error_code, address) p_do_kern_addr_fault(regs, hw_error_code, address) +#define test_oops_end(flags, regs, signr) p_oops_end(flags, regs, signr) +extern void (*p_no_context)(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code); +#define test_no_context(regs, error_code, address, signal, si_code) p_no_context(regs, error_code, address, signal, si_code) +extern void (*p_do_general_protection)(struct pt_regs *regs, long error_code); +#define test_do_general_protection(regs, error_code) p_do_general_protection(regs, error_code) +extern void (*p_do_divide_error)(struct pt_regs *regs, long error_code); +#define test_do_divide_error(regs, error_code) p_do_divide_error(regs, error_code) +#ifdef TK5 +extern bool (*p_blk_mq_dispatch_rq_list)(struct blk_mq_hw_ctx *hctx, struct list_head *list, + unsigned int nr_budgets); +#define test_blk_mq_dispatch_rq_list(hctx, list, nr_budgets) p_blk_mq_dispatch_rq_list(hctx, list, nr_budgets) +#else +extern bool (*p_blk_mq_get_driver_tag)(struct request *rq); +#define test_blk_mq_get_driver_tag(rq) p_blk_mq_get_driver_tag(rq) +extern int (*p_stat_access_remote_vm)(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); +#define test_access_remote_vm(mm, addr, buf, len, gup_flags) p_stat_access_remote_vm(mm, addr, buf, len, gup_flags) + +extern void (*p_do_page_fault)(struct pt_regs *regs, unsigned long error_code, unsigned long address); +#define do_page_fault(regs, error_code, address) p_do_page_fault(regs, error_code, address); +#ifdef TK5 +extern vm_fault_t (*p_handle_mm_fault)(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, struct pt_regs *regs); +#define handle_mm_fault(vma, address, flags, regs) p_handle_mm_fault(vma, address, flags, regs) +extern void (*p_stat_psi_task_switch)(struct task_struct *prev, struct task_struct *next, bool sleep); +#define __psi_task_switch(prev, next, sleep) p_stat_psi_task_switch(prev, next, sleep) +#endif + +extern struct task_struct * (*p_stat_pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags_stat *rf); +#define pick_next_task(rq, prev, rf) p_stat_pick_next_task(rq, prev, rf) + +extern void (*p_stat_rcu_note_context_switch)(bool preempt); +#define rcu_note_context_switch(preempt) p_stat_rcu_note_context_switch(preempt) + +#ifdef TK2 +extern void (*p__sched_fork)(struct task_struct *p); +#define __sched_fork(p) p__sched_fork(p) +extern void (*p_do_exit)(long code); +#define do_exit(code) p_do_exit(code) +#endif +extern int ksyms_init(void); +#endif diff --git a/ops/os_stat/os_stat/sysctl.c b/ops/os_stat/os_stat/sysctl.c new file mode 100644 index 0000000000000000000000000000000000000000..4d9b58147d5960b26b963dd99d40c55d6ec9106c --- /dev/null +++ b/ops/os_stat/os_stat/sysctl.c @@ -0,0 +1,382 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "version.h" +#include "sysctl.h" +#include "kprobe_prehook.h" +#include "data_aware.h" +#include "scene_layer.h" +#include "hook.h" + +static int one_hundred = 100; +static int sysctl_two = 2; +static int one_thousand = 1000; +static int max = 1000000000; + +#if defined(TK3) || defined(TK2) + +/* shared constants to be used in various sysctls */ +const int sysctl_vals[] = { 0, 1, INT_MAX }; + +/* Keep the same order as in fs/proc/proc_sysctl.c */ +#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) +#define SYSCTL_ONE ((void *)&sysctl_vals[1]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[2]) + +#endif + +/* numa_aware sysctl variables */ 
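+/*
+ * Illustrative user-space view (assumption: the names below are entries registered
+ * in os_aware_table further down, exposed as /proc/sys/os_aware/<name>; the exact
+ * semantics of each handler live in the other source files of this module):
+ *
+ *	echo 1 > /proc/sys/os_aware/enable
+ *	echo vfs_read > /proc/sys/os_aware/ftrace_hook_one_function
+ *	cat /proc/sys/os_aware/debug_data    (raw struct func_latency records, typically
+ *	                                      read by the oc-ops/t-ops front ends, not by hand)
+ *	echo 0 > /proc/sys/os_aware/enable
+ */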
+extern unsigned int sysctl_module_enable; +extern unsigned int sysctl_module_debug; +extern unsigned int sysctl_kprobe_unregister; +extern unsigned int sysctl_module_block_enable; +extern int register_ftrace_ftrace; +extern char sysctl_module_process_comm[NAME_MAX]; + +extern int sysctl_numa_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_kretprobe_enable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_kretprobe_disable_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_get_data_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_clear_data_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_get_func_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_get_kernel_data(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_ftrace_hook_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_ftrace_unhook_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_system_hook_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_system_unhook_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_ftrace_func_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_ftrace_var_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + +struct func_latency data_latency; +static struct ctl_table os_aware_table[] = { + { + .procname = "enable", + .data = &sysctl_module_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_enable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_two, + }, + { + .procname = "enable_slub_debug", + .data = &sysctl_module_enable_slub, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_slub_enable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_two, + }, + { + .procname = "enable_irq_debug", + .data = &sysctl_module_enable_irq, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_irq_enable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_two, + }, + { + .procname = "control_stat", + .data = &sysctl_module_debug, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_clear_data_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "trace_type", + .data = &sysctl_trace_type, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "proc_comm", + .data = &sysctl_module_process_comm, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "monitor_sample_rate", + .data = &sysctl_module_monitor_sampling_rate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &max, + }, + { + .procname = "var_offset_enable", + .data = 
&sysctl_module_offset_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &max, + }, + { + .procname = "var_offset1", + .data = &sysctl_module_offset1, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &max, + }, + { + .procname = "var_offset2", + .data = &sysctl_module_offset2, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &max, + }, + { + .procname = "var_offset3", + .data = &sysctl_module_offset3, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &max, + }, + { + .procname = "var_which", + .data = &sysctl_module_which_parameter, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &max, + }, + { + .procname = "enable_stat_block", + .data = &sysctl_module_block_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "debug_to_print", + .data = &sysctl_module_print, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "ftrace_success", + .data = ®ister_ftrace_ftrace, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "kret_probe_success", + .data = ®ister_kretprobe_ftrace, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "data", + .data = &sysctl_data, + .maxlen = sizeof(sysctl_data), + .mode = 0444, + .proc_handler = sysctl_get_data_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, + { + .procname = "func_data", + .data = &data_latency, + .maxlen = sizeof(struct func_latency), + .mode = 0444, + .proc_handler = sysctl_get_func_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, + { + .procname = "debug_data", + .data = &data_latency, + .maxlen = sizeof(struct func_latency), + .mode = 0444, + .proc_handler = sysctl_get_kernel_data, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, + { + .procname = "ftrace_hook_one_var", + .data = &ftrace_hook_name, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = sysctl_ftrace_var_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "ftrace_hook_one_function", + .data = &ftrace_hook_name, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = sysctl_ftrace_func_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "ftrace_hook_function", + .data = &ftrace_hook_name, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = sysctl_ftrace_hook_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "ftrace_unhook_function", + .data = &ftrace_hook_name, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = sysctl_ftrace_unhook_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "system_hook_function", + .data = &system_hook_name, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = 
sysctl_system_hook_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "system_unhook_function", + .data = &system_hook_name, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_system_unhook_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "printk_struct_first_name", + .data = &printk_name_first, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "printk_struct_last_name", + .data = &printk_name_last, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "show_parameter", + .data = &show_parameter_val, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "show_parameter_type", + .data = &show_parameter_type, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "write_func_pointer", + .data = &func_pointer, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "get_func_pointer", + .data = &func_pointer_name, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "kprobe_register_func", + .data = &symbol_new, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = sysctl_kretprobe_enable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "kprobe_unregister_func", + .data = &symbol_kret_new, + .maxlen = NAME_MAX * sizeof(char), + .mode = 0644, + .proc_handler = sysctl_kretprobe_disable_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { } +}; +#if defined(TK4_OLD) || defined(TK4_NEW) || defined(TK3) || defined(TK2) +static struct ctl_table numa_aware_root_table[] = { + { + .procname = "os_aware", + .maxlen = 0, + .mode = S_IRUGO|S_IXUGO, + .child = os_aware_table, + }, + { } +}; +#endif + +static struct ctl_table_header *module_root; + +int sysctl_table_init(void) +{ + /* Register sysctl */ +#ifdef TK5 + module_root = register_sysctl_sz("os_aware", os_aware_table, ARRAY_SIZE(os_aware_table)); +#else + module_root = register_sysctl_table(numa_aware_root_table); + if (!module_root) + return -ENOMEM; +#endif + + return 0; +} + +void sysctl_table_exit(void) +{ + /* Unregister sysctl */ + unregister_sysctl_table(module_root); +} diff --git a/ops/os_stat/os_stat/sysctl.h b/ops/os_stat/os_stat/sysctl.h new file mode 100644 index 0000000000000000000000000000000000000000..0e51ffd79a286870b477b133d049a6fb1b741377 --- /dev/null +++ b/ops/os_stat/os_stat/sysctl.h @@ -0,0 +1,10 @@ +/* + * Sysctl table + */ +#ifndef _SYSCTL_H +#define _SYSCTL_H + +extern int sysctl_table_init(void); +extern void sysctl_table_exit(void); + +#endif diff --git a/ops/os_stat/os_stat/version.h b/ops/os_stat/os_stat/version.h new file mode 100644 index 0000000000000000000000000000000000000000..16b69c86580d1222295d90b7c8efaf7065645da8 --- /dev/null +++ b/ops/os_stat/os_stat/version.h @@ -0,0 +1,30 @@ +/* + * Multi kernel versions definition + */ +#ifndef _VERSION_H +#define _VERSION_H + +#include + +/* + * Support multi kernel versions + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,6,0) +#define TK5 +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5,4,241) +#define TK4_NEW +#define TK4_NEW_NEW +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5,4,203) +#define TK4_NEW +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5,4,119) +#define TK4_OLD +#elif LINUX_VERSION_CODE >= 
KERNEL_VERSION(4,14,1) +#define TK3 +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) +#define TK2 +#if LINUX_VERSION_CODE == KERNEL_VERSION(3,10,0) +#define KVM3 +#endif +#endif + +#endif diff --git a/ops/os_stat/os_stat_paremter/example.sh b/ops/os_stat/os_stat_paremter/example.sh new file mode 100644 index 0000000000000000000000000000000000000000..80e39e293682a4380be97e77b2b61c26497190d9 --- /dev/null +++ b/ops/os_stat/os_stat_paremter/example.sh @@ -0,0 +1,20 @@ +#!/bin/sh +systemctl start docker + +docker rm -f parameter_test_001 +docker rm -f parameter_test_002 +docker rm -f parameter_test_003 +docker rm -f parameter_test_004 +docker rm -f parameter_test_005 +docker rm -f parameter_test_006 +docker rm -f parameter_test_007 +docker rm -f parameter_test_008 + +docker run -itd --privileged --cpuset-cpus="0-47,192-239" --name parameter_test_001 benchmark +docker run -itd --privileged --cpuset-cpus="48-95,240-287" --name parameter_test_002 benchmark +docker run -itd --privileged --cpuset-cpus="96-143,288-335" --name parameter_test_003 benchmark +docker run -itd --privileged --cpuset-cpus="144-191,336-383" --name parameter_test_004 benchmark +docker run -itd --privileged --cpuset-cpus="384-431,576-624" --name parameter_test_005 benchmark +docker run -itd --privileged --cpuset-cpus="432-479,625-672" --name parameter_test_006 benchmark +docker run -itd --privileged --cpuset-cpus="480-528,673-720" --name parameter_test_007 benchmark +docker run -itd --privileged --cpuset-cpus="529-575,721-767" --name parameter_test_008 benchmark diff --git a/ops/os_stat/os_stat_paremter/paremter_main.py b/ops/os_stat/os_stat_paremter/paremter_main.py new file mode 100755 index 0000000000000000000000000000000000000000..cf030faa62db9fe4613cc20c149d5cc6557b12fa --- /dev/null +++ b/ops/os_stat/os_stat_paremter/paremter_main.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import sys +import os +import re +import time +import read_dir + +if __name__ == "__main__": + print(f"------Start to scan------ \n") + read_dir.scan_dir("/proc/sys/vm", 120) + exit() + read_dir.scan_dir("/proc/sys/fs", 5) + read_dir.scan_dir("/proc/sys/net", 5) + read_dir.scan_dir("/proc/sys/kernel", 5) + read_dir.scan_dir("/proc/sys/user", 5) + read_dir.scan_dir("/proc/sys/sunrpc", 5) + print(f"\n------Scan file finished.------\n") diff --git a/ops/os_stat/os_stat_paremter/read_dir.py b/ops/os_stat/os_stat_paremter/read_dir.py new file mode 100755 index 0000000000000000000000000000000000000000..244cf0f1a42a6878aa066553576ae2786621b0aa --- /dev/null +++ b/ops/os_stat/os_stat_paremter/read_dir.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 + +import sys +import os +import re +import time +import stat +import sysbench_mysql +import sysbench_vdbench +import stat_result +import stat_result_vdbench + +counter = 0 +start = time.perf_counter() +total_length = 2 +totol_files = 1 + +def do_the_test(path, delay): + global counter + for root, dirs, files in os.walk(path, topdown=True): + if dirs: + print("-------dir:", root) + for name in files: + + fname = os.path.join(root, name) + + permission = os.stat(fname) + if not permission.st_mode & stat.S_IRUSR: + continue + if not permission.st_mode & stat.S_IWUSR: + continue + + fin = open(fname, "r+") + + try: + val = fin.read() + except OSError: + fin.close() + continue + number = int(val) + try: + number = int(val) + except ValueError: + fin.close() + continue + if number <= 1: + fin.close() + continue + delta = int(number * 5 / 10) + multi = 1 + if delta >> 7 > 10: + multi = 100 + elif delta >> 4 > 10: + 
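+                # (added note) `multi` coarsens the sweep for large defaults: the loop
+                # below walks from default-50% to default+50% in steps of `multi`
+                # (writing back i * multi), so big sysctls are not swept value by value.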
multi = 10 + elif delta >> 2 > 10: + multi = 4 + number = int(number / multi) + delta = int(delta / multi) + little = number - delta + big = number + delta + for i in range(little, big): + tmp_val = i * multi + try: + fin.seek(0, 0) + fin.write(str(tmp_val)) + except OSError: + break + try: + fin.seek(0, 0) + sysctl_value = fname + " value: "+ fin.read() + except OSError: + break + #do the test + sysbench_mysql.do_the_test(0, int(delay), "mysql", sysctl_value) + sysbench_vdbench.do_the_test("vdbench", sysctl_value, str(delay)) + #stat the result + stat_result.do_the_stat() + stat_result_vdbench.do_the_stat(str(delay)) + + fin.seek(0, 0) + fin.write(val) + fin.seek(0, 0) + print("\nfile:"+fname +" default value: "+fin.read() + "\n") + fin.close() + counter += 1 + show_progress() + #for name in dirs: + # print("-------dir:", name) + return + + +def count_files_in_directory(directory, topdown=True): + total_files = 0 + for root, dirs, files in os.walk(directory): + for fname in files: + total_files += 1 + return total_files + + +def show_progress(): + processed = int(float(counter / totol_files) * total_length) + unprocess = total_length - processed + a = "*" * processed + b = "." * unprocess + c = (counter / totol_files) * 100 + dur = time.perf_counter() - start + print("\r{:^3.0f}%[{}->{}]{:.2f}s".format(c, a, b, dur), end = "") + +def scan_dir(path, delay): + global totol_files + counter = 0 + print("------Start to scan file in {path}------\n") + totol_files = count_files_in_directory(path) * 2 + do_the_test(path, delay) + print(f"\n------Scan file in {path} finished.------\n") + + +#if __name__ == "__main__": +# path = sys.argv[1] +# os.chdir(path) +# print(f"Start to scan file in {path}\n") +# totol_files = count_files_in_directory(path) * 2 +# change_file(path) +# print(f"\nScan file in {path} finished.\n") diff --git a/ops/os_stat/os_stat_paremter/record_system_info.py b/ops/os_stat/os_stat_paremter/record_system_info.py new file mode 100644 index 0000000000000000000000000000000000000000..190ab09b1b7db07a75cb1f45635b5a8fa6adf21e --- /dev/null +++ b/ops/os_stat/os_stat_paremter/record_system_info.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + +def recore_max_info(file, file_append): + file_tmp = open(file, 'r') + file_max = open("./stat_vdbench_max_info_" + file_append + ".file", 'w+') + data = file_tmp.read() + file_max.write(data) + file_tmp.close() + file_max.close() + + + diff --git a/ops/os_stat/os_stat_paremter/stat_history_vdbench.py b/ops/os_stat/os_stat_paremter/stat_history_vdbench.py new file mode 100644 index 0000000000000000000000000000000000000000..5081f2b30a3a439ed304c491e8e52ce0d046dead --- /dev/null +++ b/ops/os_stat/os_stat_paremter/stat_history_vdbench.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +import os +import re +import record_system_info +from pathlib import Path + +def replace_line_in_file(line_number, new_content, file_append): + + file = open("./stat_vdbench.max_" + file_append, 'r') + + lines = file.readlines() + + file.close() + + lines[line_number] = new_content + + + file = open("./stat_vdbench.max_" + file_append, 'w') + + file.writelines(lines) + + file.close() + + +def save_max(vdbench_command, delay, max_avg, max_avg_info, max_std, max_std_info, max_max, max_max_info): + + data_max_get = open("./stat_vdbench_max_history_" + vdbench_command, 'w+') + line = -1 + item_avg = "avg_2-" + delay + item_std = "std_2-" + delay + item_max = "max_2-" + delay + for _l in data_max_get.readlines(): + _ll = _l.strip() + line += 1 + # skip null lines + if not _ll: + 
continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + #item + tmpidx = _ll.find("item:") + if tmpidx == -1: + continue + max_item = _ll[tmpidx + 5: ].strip() + if max_item != item_avg and max_item != item_std and max_item != item_max: + continue + + #sys value + tmpidx = _ll.find("value:") + if tmpidx == -1: + continue + tmp_value = _ll[tmpidx + 6:].strip(); + #if int(tmp_value) != sys_value: + # continue + + #command value + tmpidx = _ll.find("vdbench") + if tmpidx == -1: + continue + eidx = _ll.find("paremater:") + if eidx == -1: + continue + tmp_command = _ll[tmpidx: eidx - 1].strip() + if tmp_command != vdbench_command: + continue + + tmpidx = _ll.find("command:") + if tmpidx == -1: + continue + qpsidx = _ll.find("qps:") + if qpsidx == -1: + continue + old_max = _ll[qpsidx + 4: tmpidx - 1].strip() + if max_item == item_avg: + score = max_avg + info = max_avg_info + if max_item == item_std : + score = max_std + info = max_std_info + if max_item != item_max : + score = max_max + info = max_max_info + if float(old_max) >= score: + continue + + new_content = "result qps:" + str(score) + info + "\n" + data_max_get.close() + replace_line_in_file(line, new_content, vdbench_command) + data_max_get = open("./stat_vdbench_max_history_" + vdbench_command, 'w+') + + data_max_get.close() + + data_max = open("./stat_vdbench_max_history_" + vdbench_command, 'a') + new_content = "\nresult qps:" + str(max_avg) + max_avg_info + "\n" + data_max.write(new_content) + new_content = "\nresult qps:" + str(max_std) + max_std_info + "\n" + data_max.write(new_content) + new_content = "\nresult qps:" + str(max_max) + max_max_info + "\n" + data_max.write(new_content) + data_max.close() + + return + +def stat_file(_f, delay): + sys_path = "" + sys_value = "" + max_avg = 0 + max_std = 0 + max_max = 0 + max_std_info = "" + max_max_info = "" + max_avg_info = "" + for root, dirs, files in os.walk("./", topdown=True): + if root != "./": + continue + for name in files: + fname = os.path.join(root, name) + if name.endswith(".swp"): + continue + if name.endswith(".py.swo"): + continue + if name.find("max") == -1: + continue + with open(fname, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + # skip comment lines + if _ll.find("#") == 0: + continue + + # skip other files + if _ll.find(_f) == 0: + continue + #create files + + tmpidx = name.find("proc_") + if tmpidx == -1: + continue + + filename = name[tmpidx : ] + + tmpidx = _ll.find("command:") + if tmpidx == -1: + continue + qpsidx = _ll.find("qps:") + if qpsidx == -1: + continue + key = _ll[qpsidx + 4: tmpidx - 1].strip() + val = float(key) + + result_stat ="avg_2-" + delay + tmpidx = _ll.find(result_stat) + avg = tmpidx + std = -1 + max_2 = -1 + if (tmpidx == -1): + result_stat ="std_2-" + delay + tmpidx = _ll.find(result_stat) + std = tmpidx + if (tmpidx == -1): + result_stat ="max_2-" + delay + tmpidx = _ll.find(result_stat) + max_2 = tmpidx + if tmpidx == -1: + continue + if max_2 != -1: + if val > max_max : + max_max = val + max_max_info = _ll + continue + + if std != -1: + if val > max_std : + max_std = val + max_std_info = _ll + continue + if avg != -1: + if val > max_avg : + max_avg = val + max_avg_info = _ll + save_max(_f, delay, max_avg, max_avg_info, max_std, max_std_info, max_max, max_max_info) + + +def do_the_stat_history(delay): + + stat_file("vdbench_random_read", delay) + diff --git a/ops/os_stat/os_stat_paremter/stat_result.py b/ops/os_stat/os_stat_paremter/stat_result.py new 
file mode 100755 index 0000000000000000000000000000000000000000..be3fa9944ec25c19bf61128413a0d4fe9ba98705 --- /dev/null +++ b/ops/os_stat/os_stat_paremter/stat_result.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +import os +from pathlib import Path + +def replace_line_in_file(line_number, new_content): + + file = open("./stat.max", 'r') + + lines = file.readlines() + + file.close() + + lines[line_number] = new_content + + + file = open("./stat.max", 'w') + + file.writelines(lines) + + file.close() + + +def get_max(thds, score, sys_value, sys_path): + + data_max_get = open("./stat.max", 'r+') + line = -1 + for _l in data_max_get.readlines(): + _ll = _l.strip() + line += 1 + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + #sys value + tmpidx = _ll.find("value:") + if tmpidx == -1: + continue + tmp_value = _ll[tmpidx + 6:].strip(); + #if int(tmp_value) != sys_value: + # continue + + # thds + tmpidx = _ll.find("thds:") + if tmpidx == -1: + continue + eidx = _ll.find("paremater:") + if eidx == -1: + continue + tmp_thds = _ll[tmpidx + 5:eidx - 1].strip() + if int(tmp_thds) != thds: + continue + + qpsidx = _ll.find("qps:") + if qpsidx == -1: + continue + old_max = _ll[qpsidx + 4: tmpidx - 1].strip() + if float(old_max) >= score: + return + print("--------+++++++++", str(score)) + new_content = "result qps:" + str(score) + " thds:" + str(thds) + " paremater:" + sys_path + "\n" + data_max_get.close() + print("--------+++++++++", new_content) + replace_line_in_file(line, new_content) + return + data_max_get.close() + + data_max = open("./stat.max", 'a') + new_content = "\nresult qps:" + str(score) + " thds:" + str(thds) + " paremater:" + sys_path + "\n" + data_max.write(new_content) + data_max.close() + + return score + +def stat_file(_f): + data_out = open("./stat.file", 'w+') + data_max_tmp = open("./stat.max.tmp", 'w+') + sys_path = "" + sys_value = "" + threads = "" + for root, dirs, files in os.walk("./history", topdown=True): + for name in files: + fname = os.path.join(root, name) + if name.endswith(".swp"): + continue + with open(fname, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # skip other files + if _ll.find(_f) == 0: + continue + + tmpidx = _ll.find("queries") + if tmpidx != -1: + tmpidx = _ll.find("(") + if tmpidx == -1: + continue + eidx = _ll.find("per sec") + if tmpidx == -1: + continue + + key = _ll[tmpidx + 1 : eidx].strip() + print("result qps:" + key + " thds:" + threads + " paremater:" + sys_path + "\n", file=data_out, end=' ', flush=True) + tmp_result = get_max(int(threads), float(key), int(sys_value), sys_path) + #print("result qps:" + str(tmp_result) + " thds:" + threads + " paremater:" + sys_path + "\n", file=data_max_tmp, end=' ', flush=True) + continue + + tmpidx = _ll.find("file:") + if tmpidx != -1: + sys_path = _ll[tmpidx:].strip() + tmpidx = _ll.find("value:") + if tmpidx != -1: + sys_value = _ll[tmpidx + 6:].strip(); + continue + + tmpidx = _ll.find("thds:") + if tmpidx == -1: + continue + eidx = _ll.find("tps:") + threads = _ll[tmpidx + 5:eidx].strip() + + data_max_tmp.close() + data_out.close() + +def do_the_stat(): + file = open("./stat.file", 'a') + file.close() + + file = open("./stat.max", 'a') + file.close() + + stat_file("mysql") + + #os.unlink() diff --git a/ops/os_stat/os_stat_paremter/stat_result_vdbench.py b/ops/os_stat/os_stat_paremter/stat_result_vdbench.py new file 
mode 100755 index 0000000000000000000000000000000000000000..5643dabbec005711c69a7fc37d77d649ca00bd4d --- /dev/null +++ b/ops/os_stat/os_stat_paremter/stat_result_vdbench.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +import os +import re +import record_system_info +import stat_history_vdbench +from pathlib import Path + +def replace_line_in_file(line_number, new_content, file_append): + + file = open("./stat_vdbench.max_" + file_append, 'r') + + lines = file.readlines() + + file.close() + + lines[line_number] = new_content + + + file = open("./stat_vdbench.max_" + file_append, 'w') + + file.writelines(lines) + + file.close() + + +def get_max(score, sys_value, sys_path, vdbench_command, item): + + tmp = sys_path + file_append = "" + tmpidx = tmp.find("value:") + if tmpidx != -1: + file = tmp[ 6: tmpidx - 1].strip() + file_append = re.sub("/", "_", file) + file = open("./stat_vdbench.max_" + file_append, 'a') + file.close() + + data_max_get = open("./stat_vdbench.max_" + file_append, 'r+') + line = -1 + for _l in data_max_get.readlines(): + _ll = _l.strip() + line += 1 + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + #item + tmpidx = _ll.find("item:") + if tmpidx == -1: + continue + max_item = _ll[tmpidx + 5: ].strip() + if max_item != item: + continue + + #sys value + tmpidx = _ll.find("value:") + if tmpidx == -1: + continue + tmp_value = _ll[tmpidx + 6:].strip(); + #if int(tmp_value) != sys_value: + # continue + + #command value + tmpidx = _ll.find("vdbench") + if tmpidx == -1: + continue + eidx = _ll.find("paremater:") + if eidx == -1: + continue + tmp_command = _ll[tmpidx: eidx - 1].strip() + if tmp_command != vdbench_command: + continue + + tmpidx = _ll.find("command:") + if tmpidx == -1: + continue + qpsidx = _ll.find("qps:") + if qpsidx == -1: + continue + old_max = _ll[qpsidx + 4: tmpidx - 1].strip() + if float(old_max) >= score: + return False + new_content = "result qps:" + str(score) + " command:" + vdbench_command + " paremater:" + sys_path + " item:" + item + "\n" + data_max_get.close() + replace_line_in_file(line, new_content, file_append) + return True + data_max_get.close() + + data_max = open("./stat_vdbench.max_" + file_append, 'a') + new_content = "\nresult qps:" + str(score) + " command:" + vdbench_command + " paremater:" + sys_path + " item:" + item + "\n" + data_max.write(new_content) + data_max.close() + + return True + +def stat_file(_f, delay): + data_out = open("./stat_vdbench.file", 'w+') + sys_path = "" + sys_value = "" + for root, dirs, files in os.walk("./history_vdbench", topdown=True): + for name in files: + fname = os.path.join(root, name) + if name.endswith(".swp"): + continue + with open(fname, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # skip other files + if _ll.find(_f) == 0: + continue + #create files + + tmpidx = _ll.find("file:") + if tmpidx != -1: + sys_path = _ll[tmpidx:].strip() + tmpidx = _ll.find("value:") + if tmpidx != -1: + sys_value = _ll[tmpidx + 6:].strip(); + continue + + tmpidx = name.find("vdbench") + if tmpidx == -1: + continue + filename = name[tmpidx:] + if filename == "vdbench_create_files" or filename == "vdbench_delete_files": + result_stat ="avg_2-" + delay + tmpidx = _ll.find(result_stat) + avg = tmpidx + if (tmpidx == -1): + result_stat ="std_2-" + delay + tmpidx = _ll.find(result_stat) + if (tmpidx == -1): + result_stat ="max_2-" + delay + 
tmpidx = _ll.find(result_stat) + if tmpidx != -1: + key = _ll[tmpidx + len(result_stat) + 1 : ].strip() + black_space_idx = key.find(" ") + key = key[ : black_space_idx] + print("result " + result_stat + ":" + key + " command:" + filename + " paremater:" + sys_path + "\n", file=data_out, end=' ', flush=True) + tmp_result = get_max(float(key), int(sys_value), sys_path, filename, result_stat) + if avg != -1 and tmp_result == True: + file_append = "" + tmp = sys_path + fileidx = tmp.find("value:") + if fileidx != -1: + file = tmp[ 6 : fileidx - 1].strip() + file_append = re.sub("/", "_", file) + record_system_info.recore_max_info(fname + "_info_max", file_append) + #print("result qps:" + str(tmp_result) + " thds:" + threads + " paremater:" + sys_path + "\n", file=data_max_tmp, end=' ', flush=True) + continue + + result_stat ="avg_2-" + delay + tmpidx = _ll.find(result_stat) + avg = tmpidx + if (tmpidx == -1): + result_stat ="std_2-" + delay + tmpidx = _ll.find(result_stat) + if (tmpidx == -1): + result_stat ="max_2-" + delay + tmpidx = _ll.find(result_stat) + if tmpidx != -1: + key = _ll[tmpidx + len(result_stat) + 1 : ].strip() + black_space_idx = key.find(" ") + key = key[ : black_space_idx] + print("result " + result_stat + ":" + key + " command:" + filename + " paremater:" + sys_path + "\n", file=data_out, end=' ', flush=True) + tmp_result = get_max(float(key), int(sys_value), sys_path, filename, result_stat) + if avg != -1 and tmp_result == True: + file_append = "" + tmp = sys_path + fileidx = tmp.find("value:") + if fileidx != -1: + file = tmp[ 6 : fileidx - 1].strip() + file_append = re.sub("/", "_", file) + record_system_info.recore_max_info(fname + "_info_max", file_append) + #print("result qps:" + str(tmp_result) + " thds:" + threads + " paremater:" + sys_path + "\n", file=data_max_tmp, end=' ', flush=True) + continue + + data_out.close() + +def do_the_stat(delay): + + file = open("./stat_vdbench.file", 'a') + file.close() + + file = open("./stat_vdbench_all.file", 'a') + file.close() + + stat_file("vdbench", delay) + + stat_history_vdbench.do_the_stat_history(delay) + + file_tmp = open("./stat_vdbench.file", 'r') + file_all = open("./stat_vdbench_all.file", 'a+') + data = file_tmp.read() + file_all.write(data) + file_tmp.close() + file_all.close() + + #os.unlink() diff --git a/ops/os_stat/os_stat_paremter/sysbench_mysql.py b/ops/os_stat/os_stat_paremter/sysbench_mysql.py new file mode 100755 index 0000000000000000000000000000000000000000..eb6c04ff77e5e32c4050b1cd2c9bd4b1612534dd --- /dev/null +++ b/ops/os_stat/os_stat_paremter/sysbench_mysql.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +import os +import subprocess +import sys +import threading +import shutil + +def thread(docker, num, delay, file): + print("delay: start", delay) + + command="docker exec -it --privileged %s sysbench --db-driver=mysql \ + --mysql-host=localhost --mysql-port=3306 --mysql-user=root --mysql-db=testdb128 --tables=150 \ + --events=0 --time=%d --threads=%d --percentile=95 --report-interval=1 oltp_read_only run >> %s; \ + " % (docker, int(delay), num, file) + + #os.system(command) + result = subprocess.run(command, shell=True, text=True, capture_output=True) + + print("delay: end", delay) + +def do_the_test(num, delay, benchmakr_name, sysctl_value): + threads = [] + docker = ["parameter_test_001", "parameter_test_002", "parameter_test_003"] + + src_dir = "./tmp" + dest_dir = "./history" + + isExists=os.path.exists(src_dir) + if not isExists: + os.makedirs(src_dir) + isExists=os.path.exists(dest_dir) + if 
not isExists: + os.makedirs(dest_dir) + + for i in range(3): + benchmark_file = "./tmp/%d.%s" % ((i + 1), benchmakr_name) + with open(benchmark_file, 'w') as file: + print("\nfile:"+ sysctl_value + "\n", file=file, end=' ', flush=True) + t = threading.Thread(target=thread, args=(docker[i], (1 << i)* 8, delay, benchmark_file)) + threads.append(t) + t.start() + + # 等待所有线程执行完毕 + for t in threads: + t.join() + + for i in range(3): + src_file = "./tmp/%d.%s" % ((i + 1), benchmakr_name) + dest_file = "./history/%d.%s" % ((i + 1), benchmakr_name) + shutil.move(src_file, dest_file) + + print("All threads are finished.") + diff --git a/ops/os_stat/os_stat_paremter/sysbench_vdbench.py b/ops/os_stat/os_stat_paremter/sysbench_vdbench.py new file mode 100755 index 0000000000000000000000000000000000000000..de0387b57d7538e59967296e189bb2fa2b750fc4 --- /dev/null +++ b/ops/os_stat/os_stat_paremter/sysbench_vdbench.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +import os +import subprocess +import sys +import threading +import shutil + +def start_os_stat(time, file): + command = "oc-ops os_stat -fg 2 -de %d >> %s \ + " % (int(time), file) + #os.system(command) + result = subprocess.run(command, shell=True, text=True, capture_output=True) + +def thread(docker, command, time, file): + #print("test: start", command) + + command1 = "docker exec -it --privileged %s yum install kernel-devel-6.6.47-12.tl4.x86_64 -y \ + " % (docker) + result = subprocess.run(command1, shell=True, text=True, capture_output=True) + command2 = "docker exec -it --privileged %s yum install opencloudos-tools -y \ + " % (docker) + result = subprocess.run(command2, shell=True, text=True, capture_output=True) + command3 = "docker exec -it --privileged %s yum install kmod -y \ + " % (docker) + result = subprocess.run(command3, shell=True, text=True, capture_output=True) + + command = "docker exec -it --privileged %s /data/vdbench50407a_v10/vdbench -f \ + /data/vdbench50407a_v10/examples/filesys/%s -e %d -o test/ >> %s \ + " % (docker, command, int(time), file) + #os.system(command) + result = subprocess.run(command, shell=True, text=True, capture_output=True) + + +def do_the_test(benchmakr_name, sysctl_value, time): + threads = [] + docker = ["parameter_test_001", "parameter_test_002", "parameter_test_003", "parameter_test_004", "parameter_test_005", \ + "parameter_test_006", "parameter_test_007", "parameter_test_008"] + command = ["create_files", "delete_files", "random_read", "random_rw", "random_write", \ + "seq_read", "seq_rw", "seq_write"] + + src_dir = "./tmp_vdbench" + dest_dir = "./history_vdbench" + + isExists=os.path.exists(src_dir) + if not isExists: + os.makedirs(src_dir) + isExists=os.path.exists(dest_dir) + if not isExists: + os.makedirs(dest_dir) + + for i in range(8): + benchmark_file = "./tmp_vdbench/%d.%s_%s" % ((i + 1), benchmakr_name, command[i]) + with open(benchmark_file, 'w+') as file: + print("\nfile:"+ sysctl_value + "\n", file=file, end=' ', flush=True) + t = threading.Thread(target=thread, args=(docker[i], command[i], time, benchmark_file)) + threads.append(t) + t.start() + + benchmark_file1 = "./tmp_vdbench/%d.%s_%s_info_max" % ((i + 1), benchmakr_name, command[i]) + with open(benchmark_file1, 'w+') as file: + print("\nfile:"+ sysctl_value + "\n", file=file, end=' ', flush=True) + t = threading.Thread(target=start_os_stat, args=(time, benchmark_file1)) + threads.append(t) + t.start() + + # 等待所有线程执行完毕 + for t in threads: + t.join() + + for i in range(8): + src_file = "./tmp_vdbench/%d.%s_%s" % ((i + 1), 
benchmakr_name, command[i]) + dest_file = "./history_vdbench/%d.%s_%s" % ((i + 1), benchmakr_name, command[i]) + shutil.move(src_file, dest_file) + src_file = benchmark_file1 + file = open(src_file, 'r') + data = file.read() + for i in range(8): + dest_file = "./history_vdbench/%d.%s_%s_info_max" % ((i + 1), benchmakr_name, command[i]) + dest = open(dest_file, 'w+') + dest.write(data) + dest.close() + file.close() + #print("All threads are finished.") diff --git a/ops/os_stat/os_stat_ptrace/README.md b/ops/os_stat/os_stat_ptrace/README.md new file mode 100644 index 0000000000000000000000000000000000000000..197ac99eccbf8a38002c9d4d33bcdb9254b0f2d8 --- /dev/null +++ b/ops/os_stat/os_stat_ptrace/README.md @@ -0,0 +1,17 @@ +1. ./make.sh + +2. one terminal:start test program: +#./test + +3.the other terminal: run inject program: $pid is pid of test + +#./ptrace $pid + +4. at the "test" terminal: see the inject code:"now, inject begin" +" +hello +now, inject begin: +hello +hello +" + diff --git a/ops/os_stat/os_stat_ptrace/make.sh b/ops/os_stat/os_stat_ptrace/make.sh new file mode 100755 index 0000000000000000000000000000000000000000..f0f3873c047c63a1216d207805ec27ae89051970 --- /dev/null +++ b/ops/os_stat/os_stat_ptrace/make.sh @@ -0,0 +1,3 @@ +gcc -o test test.c +gcc -o ptrace ptrace.c + diff --git a/ops/os_stat/os_stat_ptrace/ptrace.c b/ops/os_stat/os_stat_ptrace/ptrace.c new file mode 100644 index 0000000000000000000000000000000000000000..57568fcd618c70e81e37c131954ecd7523121f9f --- /dev/null +++ b/ops/os_stat/os_stat_ptrace/ptrace.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +pid_t hook_pid = 0; +void show_reg(const char* name, struct user_regs_struct* regs) +{ + fprintf(stderr, "===========================\n"); + fprintf(stderr, "[%s]==> r15: %llx\n", name, regs->r15); + fprintf(stderr, "[%s]==> r14: %llx\n", name, regs->r14); + fprintf(stderr, "[%s]==> r13: %llx\n", name, regs->r13); + fprintf(stderr, "[%s]==> r12: %llx\n", name, regs->r12); + fprintf(stderr, "[%s]==> rbp: %llx\n", name, regs->rbp); + fprintf(stderr, "[%s]==> rbx: %llx\n", name, regs->rbx); + fprintf(stderr, "[%s]==> r11: %llx\n", name, regs->r11); + fprintf(stderr, "[%s]==> r10: %llx\n", name, regs->r10); + fprintf(stderr, "[%s]==> r9: %llx\n", name, regs->r9); + fprintf(stderr, "[%s]==> r8: %llx\n", name, regs->r8); + fprintf(stderr, "[%s]==> rax: %llx\n", name, regs->rax); + fprintf(stderr, "[%s]==> rcx: %llx\n", name, regs->rcx); + fprintf(stderr, "[%s]==> rdx: %llx\n", name, regs->rdx); + fprintf(stderr, "[%s]==> rsi: %llx\n", name, regs->rsi); + fprintf(stderr, "[%s]==> rdi: %llx\n", name, regs->rdi); + fprintf(stderr, "[%s]==> orig_rax: %llx\n", name, regs->orig_rax); + fprintf(stderr, "[%s]==> rip: %llx\n", name, regs->rip); + fprintf(stderr, "[%s]==> cs: %llx\n", name, regs->cs); + fprintf(stderr, "[%s]==> eflags: %llx\n", name, regs->eflags); + fprintf(stderr, "[%s]==> rsp: %llx\n", name, regs->rsp); + fprintf(stderr, "[%s]==> ss: %llx\n", name, regs->ss); + fprintf(stderr, "[%s]==> fs_base: %llx\n", name, regs->fs_base); + fprintf(stderr, "[%s]==> gs_base: %llx\n", name, regs->gs_base); + fprintf(stderr, "[%s]==> ds: %llx\n", name, regs->ds); + fprintf(stderr, "[%s]==> 
es: %llx\n", name, regs->es); + fprintf(stderr, "[%s]==> fs: %llx\n", name, regs->fs); + fprintf(stderr, "[%s]==> gs: %llx\n", name, regs->gs); + fprintf(stderr, "===========================\n"); +} +const int long_size = sizeof(long); +/*PTRACE_PEEKDATA, get data*/ +void getdata(pid_t child, long addr, char *str, int len) +{ + long data; + int i, j; + char *pos = str; + void *src = (void *)addr; + + i = 0; + j = len / sizeof(data); + + while(i < j) { + data = ptrace(PTRACE_PEEKTEXT, child, + (void *)src, NULL); + memcpy(pos, &data, sizeof(data)); + ++i; + src += sizeof(data); + pos += sizeof(data); + } + j = len % sizeof(data); + if(j != 0) { + data = ptrace(PTRACE_PEEKTEXT, child, + src, NULL); + memcpy(pos, &data, j); + } +} + +/*PTRACE_POKEDATA, inject data*/ +void putdata(pid_t child, void *addr, char *str, int len) +{ + long data = 0; + int i, j; + int ret; + void *dst = addr; + + i = 0; + j = len / sizeof(data); + while(i < j) { + ret = ptrace(PTRACE_POKETEXT, child, + dst, *(long *)(str + i * sizeof(data))); + dst += sizeof(data); + ++i; + } + + j = len % sizeof(data); + if(j != 0) { + data = ptrace(PTRACE_PEEKTEXT, child, + dst, NULL); + memcpy((void *)(&data), str + i * sizeof(data), j); + ret = ptrace(PTRACE_POKETEXT, child, + dst, data); + } +} +int check_status(pid_t pid, const char* message) +{ + int status = 0; + pid_t pid_result = waitpid(pid, &status, __WALL); + if (pid_result < 0) + { + printf("Failed to wait for PID %d: %s\n", pid, strerror(errno)); + return 1; + } + + if (pid_result == 0) + { + printf("pid_result == 0\n"); + return 0; + } + + printf( "[%s] pid_result: %d, attach pid: %d\n", message, pid_result, pid); + if (WIFEXITED(status)) + { + printf("[%s] exited, status=%d\n", message, WEXITSTATUS(status)); + return 1; + } + else if (WIFSIGNALED(status)) + { + printf("[%s] killed by signal %d\n", message, WTERMSIG(status)); + } + else if (WIFSTOPPED(status)) + { + // signal-5: SIGTRAP + // signal-11: SIGSEGV + // signal-19: SIGSTOP + printf( "[%s] stopped by signal %d\n", message, WSTOPSIG(status)); + } + else if (WIFCONTINUED(status)) + { + printf("[%s] continued\n", message); + } + else if (WCOREDUMP(status)) + { + printf("[%s] core dumped.\n", message); + } + + printf("[%s] wait end\n", message); + return 0; +} +void sig_process(int sig_no) +{ + fprintf(stderr, "signal: %d has been received\n", sig_no); + + int ret = ptrace(PTRACE_DETACH, hook_pid, + NULL, NULL); + if (ret != 0) + { + fprintf(stderr, "ptrace_detach error, return: %d\n", ret); + } + + _exit(0); +} + +static __always_inline volatile void *inject_mmap( + void *addr, + uint64_t length, + uint64_t port, + uint64_t flags, + uint64_t fd, + uint64_t offset) +{ + long mmap_fd = fd; + unsigned long mmap_off = offset; + unsigned long mmap_flag = flags; + unsigned long ret; + __asm__ volatile( + "mov %0, %%rdi\n" + "mov %1, %%rsi\n" + "mov %2, %%rdx\n" + "mov %3, %%r10\n" + "mov %4, %%r8\n" + "mov %5, %%r9\n" + "mov $0x9, %%rax\n" + "syscall" + : + : "g"(addr), "g"(length), "g"(port), "g"(flags), "g"(fd), "g"(offset)); + asm("mov %%rax, %0" + : "=r"(ret)); + return (void *)ret; +} +__always_inline long inject_write(long fd, char *buf, unsigned long len) +{ + long ret; + asm volatile ( + "mov %0, %%rdi\n" + "mov %1, %%rsi\n" + "mov %2, %%rdx\n" + "mov $1, %%rax\n" + "syscall" + : + :"g"(fd), "g"(buf), "g"(len) + ); + asm("mov %%rax, %0":"=r"(ret)); + return ret; +} +// Injection code run for create engouth memeory to insert the sepecify elf file. 
+void injection_function(void * addr)//void * addr +{ + void *ret; + + addr = (void *)0x100000; + // Apply for enought memory to inject the elf_program. + char str[] = {"now, inject begin:\n"}; + + ret = inject_mmap((void *)addr, 8192, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0); + if (ret == MAP_FAILED) { + asm volatile ( + "mov $60, %%rax\n" + "syscall" + :: + ); + }else { + inject_write(1, str, sizeof(str)); + } + // Soft breakpoint + asm volatile ("int3"); +} +void inject_func_end(){} + +unsigned long get_size() +{ + return (unsigned long)((char *)inject_func_end - (char *)injection_function); +} + +void *injection_code() +{ + void *p_injection; + p_injection = malloc(get_size()); + memcpy(p_injection, injection_function, get_size()); + + return p_injection; +} +int main(int argc, char **argv) +{ + int traced_process = -1, ret, i; + struct user_regs_struct regs; + struct user_regs_struct old_regs; + char insertcode[] = + "\xeb\x15\x5e\xb8\x04\x00" + "\x00\x00\xbb\x02\x00\x00\x00\x89\xf1\xba" + "\x0c\x00\x00\x00\xcd\x80\xcc\xe8\xe6\xff" + "\xff\xff\x48\x65\x6c\x6c\x6f\x20\x57\x6f" + "\x72\x6c\x64\x0a\x00"; + int len = sizeof(insertcode); + char *backup; + unsigned long addr; + char test[12]; + void *p_injection; + + if(argc >= 2) + traced_process = atoi(argv[1]); + signal(SIGINT, sig_process); + hook_pid = traced_process; + + p_injection = injection_code(); + + ret = ptrace(PTRACE_ATTACH, traced_process, NULL, NULL); + if (ret < 0) { + printf("attach error\n"); + return 0; + } + + wait(NULL); // 等待目标进程暂停 + memset(®s, 0, sizeof(struct user_regs_struct)); + ret = ptrace(PTRACE_GETREGS, traced_process, NULL, ®s); + show_reg("saved", ®s); + memcpy(&old_regs, ®s, sizeof(regs)); + + len = get_size(); + if (len < sizeof(insertcode)) + len = sizeof(insertcode); + backup = malloc(len); + + getdata(traced_process, regs.rip, backup, len); + putdata(traced_process, regs.rip, p_injection, len); + //putdata(traced_process, regs.rip, insertcode, sizeof(insertcode)); + + ret = ptrace(PTRACE_SETREGS, traced_process, NULL, ®s); + ret = ptrace(PTRACE_CONT, traced_process, NULL, 0); + check_status(traced_process, "ptrace_cont"); + + printf("The process stopped, putting back " + "the original instructions \n"); + printf("Press to continue "); + getchar(); + + putdata(traced_process, old_regs.rip, backup, len); + ptrace(PTRACE_SETREGS, traced_process, + NULL, &old_regs); + printf("Letting it continue with " + "original flow "); + + ptrace(PTRACE_DETACH, traced_process, + NULL, NULL); + + free(p_injection); + return 0; +} diff --git a/ops/os_stat/os_stat_ptrace/test.c b/ops/os_stat/os_stat_ptrace/test.c new file mode 100644 index 0000000000000000000000000000000000000000..e84e355dea044ee6658177f2ad383f373d6d3fa8 --- /dev/null +++ b/ops/os_stat/os_stat_ptrace/test.c @@ -0,0 +1,16 @@ +#include +#include +#include +#include + +int main() +{ + int i = 0; + printf("main addr : 0x%lx\n", (u_int64_t)main); + printf("Please inject me\n"); + for(i = 0; i < 99999; i++){ + printf("hello\n"); + sleep(1); + } + return 0; +} diff --git a/ops/os_stat/os_stat_show_parameter/compute.sh b/ops/os_stat/os_stat_show_parameter/compute.sh new file mode 100755 index 0000000000000000000000000000000000000000..b1b0487aab55e3c1a7bd832342197cc671fd68db --- /dev/null +++ b/ops/os_stat/os_stat_show_parameter/compute.sh @@ -0,0 +1,166 @@ +#!/bin/sh + +function get_offset() +{ + pahole -C $1 $2 | grep $3; +} +function helper() +{ + echo -e "\033[32m-s, --struct: "show struct, such as struct file, uses 
-s file .etc"\033[0m" + echo -e "\033[32m-a, --parameter: \"show parameter name in struct .etc\" \033[0m" + echo -e "\033[32m-i, --pointer: \"show pointer member pos in var, if no pointer member, not assign\"\033[0m" + echo -e "\033[32m-t, --type: \"show member type, such as int/long/char \"\033[0m" + echo -e "\033[32m-v, --vmlinux: \"linux vmlinux\"\033[0m" + echo -e "\033[32m-w, --which: \"to show which parameter of function\"\033[0m" + echo -e "\033[32m-de, --delay: \"duration, default 1(s)\"\033[0m" + echo -e "\033[32m-p, --proc: \"default:null. if assigned, only trace the process\"\033[0m" + echo such as: + echo -e "\033[32m1.show parameter of kernel function: 1st\): show \"file-\>f_path.dentry-\>d_iname\" of vfs_read 2nd\):show 3rd parameter: \"count\" of vfs_read\"\033[0m" + echo -e "\033[32m (1) t-ops os_stat -fg 5 -s file -s path -s dentry -a f_path -a dentry -a d_iname -i 3 -t char -f vfs_read -v /boot/vmlinux-$(uname -r) \033[0m" + echo -e "\033[32m (2) t-ops os_stat -fg 5 -w 3 -t long -f vfs_read -v /boot/vmlinux-$(uname -r) \033[0m" +} +#main +function main() +{ + PARAMETER={} + STRUCT={} + POINTER={} + FUNCTION="" + TYPE="" + PROC="" + DELAY=0 + i=0 + j=0 + k=0 + m=0 + order=0 + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -v|--vmlinux) + VIMLINUX="$2" + shift # past argument + shift # past value + ;; + -a|--parameter) + PARAMETER[i]="$2" + i=$(($i+1)) + shift # past argument + shift # past value + ;; + -s|--struct) + STRUCT[j]="$2" + j=$(($j+1)) + shift # past argument + shift # past value + ;; + -w|--which) + order="$2" + shift # past argument + shift # past value + ;; + -i|--pointer) + POINTER[k]="$2" + k=$(($k+1)) + shift # past argument + shift # past value + ;; + -f|--function) + FUNCTION="$2" + shift # past argument + shift # past value + ;; + -t|--type) + TYPE="$2" + shift # past argument + shift # past value + ;; + -de|--delay) + DELAY=$2 + shift # past argument + shift # past value + ;; + -p|--proc) + PROC="$2" + shift # past argument + shift # past value + ;; + -h|--help) + helper + return + shift # past argument + ;; + *) + POSITIONAL+=("$1") # save it in an array for later + shift # past argument + ;; + esac + done + + offset={} + addr={} + + if [ ! -f "$VIMLINUX" ]; then + printf "\033[31m $VIMLINUX is not exist \033[0m \n" + exit 1 + fi + for(( j=0; j<$i; j++ )) do + + get_offset ${STRUCT[j]} $VIMLINUX ${PARAMETER[j]} &> data.txt + offset[j]=$(./get_offset.py data.txt) + + done + if [ $k -gt 0 ]; then + for(( j=0; j < $k; j++ )) do + addr[j]=0 + if [ $j == 0 ]; then + m=0 + else : + m=$((${POINTER[j]} - 1)) + fi + for(( ; m < ${POINTER[j]} - 1; m++ )) do + addr[j]=$((${addr[j]}+${offset[m]})) + done + done + if [ $k -gt 0 ]; then + k=$(($k - 1)) + for(( m=$((${POINTER[k]} - 1)); m < $i; m++ )) do + addr[j]=$((${addr[j]}+${offset[m]})) + j=$(($j + 1)) + done + fi + else : + addr[0]=${offset[0]} + fi + j=$(($j - 1)) + + for (( i = 0; i <= j; i++)) do + echo ${addr[i]} > /proc/sys/os_aware/var_offset$(($i+1)) + done + if [ ! 
-z "$PROC" ]; then + echo $PROC > /proc/sys/os_aware/proc_comm + fi + echo $order > /proc/sys/os_aware/var_which + echo 1 > /proc/sys/os_aware/var_offset_enable + echo $TYPE > /proc/sys/os_aware/show_parameter_type + echo $FUNCTION > /proc/sys/os_aware/ftrace_hook_function + i=0 + delta=0.01 + cycle=100 + if [ $DELAY -gt 1 ]; then + cycle=$(($DELAY * 100)) + fi + sleep 0.2 + while [[ $i -lt $cycle ]]; do + sleep $delta + cat /proc/sys/os_aware/show_parameter + i=$(($i + 1)) + done + echo 0 > /proc/sys/os_aware/var_offset_enable + echo " " > /proc/sys/os_aware/show_parameter_type + echo 0 > /proc/sys/os_aware/var_which + echo $FUNCTION > /proc/sys/os_aware/ftrace_unhook_function + +} + +main $* diff --git a/ops/os_stat/os_stat_show_parameter/get_offset.py b/ops/os_stat/os_stat_show_parameter/get_offset.py new file mode 100755 index 0000000000000000000000000000000000000000..9969f59fbe84662e9db1b33af8a5081e54dac8b0 --- /dev/null +++ b/ops/os_stat/os_stat_show_parameter/get_offset.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +import sys +import os + +def parse_conf(_f): + _ret = dict() + i=0 + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + eidx = _ll.find("/*") + if eidx == -1: + continue + sub = _ll[eidx+2 : ].strip() + eidx = sub.find(" ") + if eidx == -1: + continue + val = sub[ : eidx].strip() + + print("{:<40}".format(val)) + return + +if __name__ == "__main__": + _f1 = sys.argv[1]; + + _kv1 = parse_conf(_f1); diff --git a/ops/os_stat/os_stat_uprobe/get_offset.py b/ops/os_stat/os_stat_uprobe/get_offset.py new file mode 100755 index 0000000000000000000000000000000000000000..3cfbce771ba4c164232534690615d94c7a8b707b --- /dev/null +++ b/ops/os_stat/os_stat_uprobe/get_offset.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +import sys +import os + +def parse_conf(_f, _func): + _ret = dict() + i=0 + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + eidx = _ll.find(":") + if eidx == -1: + continue + sub = _ll[eidx : ].strip() + eidx = sub.find(_func) + if eidx != -1: + continue + + eidx = _ll.find("<") + if eidx == -1: + continue + val = _ll[ : eidx - 1].strip() + val = val.lstrip("0") + + print("{:<40}".format(val)) + return + +if __name__ == "__main__": + _f1 = sys.argv[1]; + _func = sys.argv[2]; + + _kv1 = parse_conf(_f1, _func); diff --git a/ops/os_stat/os_stat_uprobe/uprobe.bpf.c b/ops/os_stat/os_stat_uprobe/uprobe.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..f95d470b3d850fe405fc807cd6e5c4d8df45dbde --- /dev/null +++ b/ops/os_stat/os_stat_uprobe/uprobe.bpf.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +const volatile __u64 func_addr = 0; +const volatile __u64 func_addr_test; +SEC("uprobe") +int BPF_KPROBE(uprobe_add, int a, int b) +{ + unsigned long user; + bpf_printk("uprobed_add ENTRY: a = %d, b = %d, %lx\n", a, b, ctx); + bpf_printk("uprobed_add ENTRY: %lx %lx, %lx\n", func_addr, func_addr_test, PT_REGS_RET(ctx)); + return 0; +} + +SEC("uretprobe") +int BPF_KRETPROBE(uretprobe_add, int ret) +{ + bpf_printk("uprobed_add EXIT: return = %d\n", ret); + return 0; +} +#if 0 +SEC("uprobe//proc/self/exe:uprobed_sub") +int BPF_KPROBE(uprobe_sub, int a, int b) +{ + bpf_printk("uprobed_sub ENTRY: a = %d, b = %d\n", a, b); + return 0; +} + 
+SEC("uretprobe//proc/self/exe:uprobed_sub") +int BPF_KRETPROBE(uretprobe_sub, int ret) +{ + bpf_printk("uprobed_sub EXIT: return = %d\n", ret); + return 0; +} +#endif diff --git a/ops/os_stat/os_stat_uprobe/uprobe.c b/ops/os_stat/os_stat_uprobe/uprobe.c new file mode 100644 index 0000000000000000000000000000000000000000..849b455087e730568fbb8dc5fb1f08f58dd46d7f --- /dev/null +++ b/ops/os_stat/os_stat_uprobe/uprobe.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include +#include +#include "uprobe.skel.h" + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + return vfprintf(stderr, format, args); +} + +/* + * Taken from https://github.com/torvalds/linux/blob/9b59ec8d50a1f28747ceff9a4f39af5deba9540e/tools/testing/selftests/bpf/trace_helpers.c#L149-L205 + * + * See discussion in https://github.com/libbpf/libbpf-bootstrap/pull/90 + */ +ssize_t get_uprobe_offset(const void *addr, int pid) +{ + size_t start, end, base; + char buf[256]; + bool found = false; + FILE *f; + char maps[64]; + + if (pid > -1) + sprintf(maps, "/proc/%d/maps", pid); + else + sprintf(maps, "/proc/self/maps"); + + f = fopen(maps, "r"); + if (!f) { + printf("-----open file error:%s\n", maps); + return -errno; + } + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { + if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { + found = true; + break; + } + } + + fclose(f); + + if (!found) + return -ESRCH; + + return (uintptr_t)addr - start + base; +} +int new_uprobed_add(int a, int b) +{ + a = 0x55aa; + b = 0x55aa; + return a + b; +} + +/* It's a global function to make sure compiler doesn't inline it. */ +int uprobed_add(int a, int b) +{ + return a + b; +} + +int uprobed_sub(int a, int b) +{ + return a - b; +} + +int main(int argc, char **argv) +{ + struct uprobe_bpf *skel; + long uprobe_offset = 0; + int err, i, ret, pid = -1; + int hook = 1; + unsigned long offset = 0; + char exe[64]; + + if(argc >= 2) + hook = atoi(argv[1]); + if (argc >= 3) + pid = atoi(argv[2]); + if (argc >= 4) + offset = strtol(argv[3], argv[4], 16); + + if (hook == 0) + goto to_run; + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + /* Set up libbpf errors and debug info callback */ + libbpf_set_print(libbpf_print_fn); + + /* Load and verify BPF application */ + skel = uprobe_bpf__open(); + if (!skel) { + fprintf(stderr, "Failed to open and load BPF skeleton\n"); + return 1; + } + skel->rodata->func_addr = get_uprobe_offset(&new_uprobed_add, pid); + skel->rodata->func_addr_test = (unsigned long)new_uprobed_add; + ret = uprobe_bpf__load(skel); + if (ret) { + fprintf(stderr, "failed to load bpf object\n"); + return 1; + } + /* uprobe/uretprobe expects relative offset of the function to attach + * to. This offset is relateve to the process's base load address. So + * easy way to do this is to take an absolute address of the desired + * function and substract base load address from it. If we were to + * parse ELF to calculate this function, we'd need to add .text + * section offset and function's offset within .text ELF section. 
+ */ + + /* Attach tracepoint handler */ + if (offset > 0) + uprobe_offset = get_uprobe_offset((void *)offset, pid); + else + /* for test only */ + uprobe_offset = get_uprobe_offset(&uprobed_add, pid); + sprintf(exe, "/proc/%d/exe", pid); + skel->links.uprobe_add = bpf_program__attach_uprobe(skel->progs.uprobe_add, + false /* not uretprobe */, + pid /* self pid */, + exe, + uprobe_offset); + if (!skel->links.uprobe_add) { + err = -errno; + fprintf(stderr, "Failed to attach uprobe: %d\n", err); + goto cleanup; + } + + /* we can also attach uprobe/uretprobe to any existing or future + * processes that use the same binary executable; to do that we need + * to specify -1 as PID, as we do here + */ + skel->links.uretprobe_add = bpf_program__attach_uprobe(skel->progs.uretprobe_add, + true /* uretprobe */, + -1 /* any pid */, + exe, + uprobe_offset); + if (!skel->links.uretprobe_add) { + err = -errno; + fprintf(stderr, "Failed to attach uprobe: %d\n", err); + goto cleanup; + } + + /* Let libbpf perform auto-attach for uprobe_sub/uretprobe_sub + * NOTICE: we provide path and symbol info in SEC for BPF programs + */ + err = uprobe_bpf__attach(skel); + if (err) { + fprintf(stderr, "Failed to auto-attach BPF skeleton: %d\n", err); + goto cleanup; + } + + printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` " + "to see output of the BPF programs.\n"); +to_run: + for (i = 0; ; i++) { + /* trigger our BPF programs */ + /* hook = 0: for test only */ + if (hook == 0) { + fprintf(stderr, "."); + uprobed_add(i, i + 1); + uprobed_sub(i * i, i); + } + sleep(1); + } + +cleanup: + if (hook == 1) + uprobe_bpf__destroy(skel); + return -err; +} diff --git a/ops/os_stat/os_stat_uprobe/uprobe.skel.h b/ops/os_stat/os_stat_uprobe/uprobe.skel.h new file mode 100644 index 0000000000000000000000000000000000000000..3e1e0d231221474a815b979e092ff255505017b9 --- /dev/null +++ b/ops/os_stat/os_stat_uprobe/uprobe.skel.h @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* THIS FILE IS AUTOGENERATED! 
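   Regenerate it with: bpftool gen skeleton uprobe.bpf.o > uprobe.skel.h
   (this is what uprobe_trace.sh does when the uprobe binary is missing).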
*/ +#ifndef __UPROBE_BPF_SKEL_H__ +#define __UPROBE_BPF_SKEL_H__ + +#include +#include + +struct uprobe_bpf { + struct bpf_object_skeleton *skeleton; + struct bpf_object *obj; + struct { + struct bpf_map *rodata; + } maps; + struct { + struct bpf_program *uprobe_add; + struct bpf_program *uretprobe_add; + } progs; + struct { + struct bpf_link *uprobe_add; + struct bpf_link *uretprobe_add; + } links; + struct uprobe_bpf__rodata { + __u64 func_addr; + __u64 func_addr_test; + char ____uprobe_add_____fmt[40]; + char ____uprobe_add_____fmt_1[33]; + char ____uretprobe_add_____fmt[31]; + } *rodata; +}; + +static void +uprobe_bpf__destroy(struct uprobe_bpf *obj) +{ + if (!obj) + return; + if (obj->skeleton) + bpf_object__destroy_skeleton(obj->skeleton); + free(obj); +} + +static inline int +uprobe_bpf__create_skeleton(struct uprobe_bpf *obj); + +static inline struct uprobe_bpf * +uprobe_bpf__open_opts(const struct bpf_object_open_opts *opts) +{ + struct uprobe_bpf *obj; + + obj = (struct uprobe_bpf *)calloc(1, sizeof(*obj)); + if (!obj) + return NULL; + if (uprobe_bpf__create_skeleton(obj)) + goto err; + if (bpf_object__open_skeleton(obj->skeleton, opts)) + goto err; + + return obj; +err: + uprobe_bpf__destroy(obj); + return NULL; +} + +static inline struct uprobe_bpf * +uprobe_bpf__open(void) +{ + return uprobe_bpf__open_opts(NULL); +} + +static inline int +uprobe_bpf__load(struct uprobe_bpf *obj) +{ + return bpf_object__load_skeleton(obj->skeleton); +} + +static inline struct uprobe_bpf * +uprobe_bpf__open_and_load(void) +{ + struct uprobe_bpf *obj; + + obj = uprobe_bpf__open(); + if (!obj) + return NULL; + if (uprobe_bpf__load(obj)) { + uprobe_bpf__destroy(obj); + return NULL; + } + return obj; +} + +static inline int +uprobe_bpf__attach(struct uprobe_bpf *obj) +{ + return bpf_object__attach_skeleton(obj->skeleton); +} + +static inline void +uprobe_bpf__detach(struct uprobe_bpf *obj) +{ + return bpf_object__detach_skeleton(obj->skeleton); +} + +static inline int +uprobe_bpf__create_skeleton(struct uprobe_bpf *obj) +{ + struct bpf_object_skeleton *s; + + s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s)); + if (!s) + return -1; + obj->skeleton = s; + + s->sz = sizeof(*s); + s->name = "uprobe_bpf"; + s->obj = &obj->obj; + + /* maps */ + s->map_cnt = 1; + s->map_skel_sz = sizeof(*s->maps); + s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz); + if (!s->maps) + goto err; + + s->maps[0].name = "uprobe_b.rodata"; + s->maps[0].map = &obj->maps.rodata; + s->maps[0].mmaped = (void **)&obj->rodata; + + /* programs */ + s->prog_cnt = 2; + s->prog_skel_sz = sizeof(*s->progs); + s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz); + if (!s->progs) + goto err; + + s->progs[0].name = "uprobe_add"; + s->progs[0].prog = &obj->progs.uprobe_add; + s->progs[0].link = &obj->links.uprobe_add; + + s->progs[1].name = "uretprobe_add"; + s->progs[1].prog = &obj->progs.uretprobe_add; + s->progs[1].link = &obj->links.uretprobe_add; + + s->data_sz = 8656; + s->data = (void *)"\ +\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x90\x1a\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x1d\0\ +\x01\0\x7b\x1a\xd8\xff\0\0\0\0\x79\xa3\xd8\xff\0\0\0\0\x79\x32\x70\0\0\0\0\0\ +\x79\x31\x68\0\0\0\0\0\x7b\x3a\xf8\xff\0\0\0\0\x63\x2a\xf4\xff\0\0\0\0\x63\x1a\ +\xf0\xff\0\0\0\0\x79\xa5\xf8\xff\0\0\0\0\x61\xa4\xf0\xff\0\0\0\0\x61\xa3\xf4\ +\xff\0\0\0\0\x18\x01\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\xb7\x02\0\0\x28\0\0\0\x85\0\ 
+\0\0\x06\0\0\0\x7b\x0a\xe8\xff\0\0\0\0\x18\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x79\ +\x13\0\0\0\0\0\0\x18\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x79\x14\0\0\0\0\0\0\x79\ +\xa1\xf8\xff\0\0\0\0\x79\x15\x98\0\0\0\0\0\x18\x01\0\0\x38\0\0\0\0\0\0\0\0\0\0\ +\0\xb7\x02\0\0\x21\0\0\0\x85\0\0\0\x06\0\0\0\x7b\x0a\xe0\xff\0\0\0\0\xb7\0\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x79\xa2\xe0\xff\0\0\0\0\x79\ +\x21\x50\0\0\0\0\0\x7b\x2a\xf8\xff\0\0\0\0\x63\x1a\xf4\xff\0\0\0\0\x61\xa3\xf4\ +\xff\0\0\0\0\x18\x01\0\0\x59\0\0\0\0\0\0\0\0\0\0\0\xb7\x02\0\0\x1f\0\0\0\x85\0\ +\0\0\x06\0\0\0\x7b\x0a\xe8\xff\0\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x44\ +\x75\x61\x6c\x20\x42\x53\x44\x2f\x47\x50\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x75\x70\x72\x6f\x62\x65\x64\x5f\x61\x64\x64\x20\x45\x4e\x54\x52\x59\x3a\ +\x20\x61\x20\x3d\x20\x25\x64\x2c\x20\x62\x20\x3d\x20\x25\x64\x2c\x20\x25\x6c\ +\x78\x0a\0\x75\x70\x72\x6f\x62\x65\x64\x5f\x61\x64\x64\x20\x45\x4e\x54\x52\x59\ +\x3a\x20\x25\x6c\x78\x20\x25\x6c\x78\x2c\x20\x25\x6c\x78\x0a\0\x75\x70\x72\x6f\ +\x62\x65\x64\x5f\x61\x64\x64\x20\x45\x58\x49\x54\x3a\x20\x72\x65\x74\x75\x72\ +\x6e\x20\x3d\x20\x25\x64\x0a\0\x01\x11\x01\x25\x25\x13\x05\x03\x25\x72\x17\x10\ +\x17\x1b\x25\x11\x01\x55\x23\x73\x17\x74\x17\0\0\x02\x34\0\x03\x25\x49\x13\x3f\ +\x19\x3a\x0b\x3b\x0b\x02\x18\0\0\x03\x01\x01\x49\x13\0\0\x04\x21\0\x49\x13\x37\ +\x0b\0\0\x05\x24\0\x03\x25\x3e\x0b\x0b\x0b\0\0\x06\x24\0\x03\x25\x0b\x0b\x3e\ +\x0b\0\0\x07\x26\0\x49\x13\0\0\x08\x35\0\x49\x13\0\0\x09\x16\0\x49\x13\x03\x25\ +\x3a\x0b\x3b\x0b\0\0\x0a\x2e\x01\0\0\x0b\x34\0\x03\x25\x49\x13\x3a\x0b\x3b\x0b\ +\x02\x18\0\0\x0c\x34\0\x03\x25\x49\x13\x3a\x0b\x3b\x0b\0\0\x0d\x0f\0\x49\x13\0\ +\0\x0e\x15\x01\x49\x13\x27\x19\0\0\x0f\x05\0\x49\x13\0\0\x10\x18\0\0\0\x11\x2e\ +\x01\x03\x25\x3a\x0b\x3b\x0b\x27\x19\x49\x13\x20\x21\x01\0\0\x12\x05\0\x03\x25\ +\x3a\x0b\x3b\x0b\x49\x13\0\0\x13\x13\x01\x03\x25\x0b\x0b\x3a\x0b\x3b\x0b\0\0\ +\x14\x0d\0\x03\x25\x49\x13\x3a\x0b\x3b\x0b\x38\x0b\0\0\x15\x2e\x01\x11\x1b\x12\ +\x06\x40\x18\x03\x25\x3a\x0b\x3b\x0b\x27\x19\x49\x13\x3f\x19\0\0\x16\x05\0\x02\ +\x18\x03\x25\x3a\x0b\x3b\x0b\x49\x13\0\0\x17\x1d\x01\x31\x13\x11\x1b\x12\x06\ +\x58\x0b\x59\x0b\x57\x0b\0\0\x18\x05\0\x02\x18\x31\x13\0\0\0\x7d\x02\0\0\x05\0\ +\x01\x08\0\0\0\0\x01\0\x1d\0\x01\x08\0\0\0\0\0\0\0\x02\0\0\0\0\0\0\0\0\0\x08\0\ +\0\0\x0c\0\0\0\x02\x03\x36\0\0\0\0\x07\x02\xa1\0\x03\x42\0\0\0\x04\x46\0\0\0\ +\x0d\0\x05\x04\x06\x01\x06\x05\x08\x07\x02\x06\x55\0\0\0\0\x08\x02\xa1\x01\x07\ +\x5a\0\0\0\x08\x5f\0\0\0\x09\x67\0\0\0\x08\x01\x1f\x05\x07\x07\x08\x02\x09\x55\ +\0\0\0\0\x09\x02\xa1\x02\x0a\x0b\x0a\x8e\0\0\0\0\x0e\x02\xa1\x03\x0b\x0a\xd7\0\ +\0\0\0\x0f\x02\xa1\x04\0\x03\x9a\0\0\0\x04\x46\0\0\0\x28\0\x07\x42\0\0\0\x0c\ +\x0b\xa7\0\0\0\x02\xb9\x07\xac\0\0\0\x0d\xb1\0\0\0\x0e\xc2\0\0\0\x0f\xc6\0\0\0\ +\x0f\xcb\0\0\0\x10\0\x05\x0c\x05\x08\x0d\x9a\0\0\0\x09\xd3\0\0\0\x0e\x01\x1b\ +\x05\x0d\x07\x04\x03\x9a\0\0\0\x04\x46\0\0\0\x21\0\x0a\x0b\x0a\xf0\0\0\0\0\x16\ +\x02\xa1\x05\0\x03\x9a\0\0\0\x04\x46\0\0\0\x1f\0\x11\x0f\0\x0b\x1d\x01\0\0\x12\ +\x11\0\x0b\x21\x01\0\0\x12\x29\0\x0b\x1d\x01\0\0\x12\x2a\0\x0b\x1d\x01\0\0\0\ +\x05\x10\x05\x04\x0d\x26\x01\0\0\x13\x28\xa8\x03\x29\x14\x12\xe9\x01\0\0\x03\ +\x2e\0\x14\x14\xe9\x01\0\0\x03\x2f\x08\x14\x15\xe9\x01\0\0\x03\x30\x10\x14\x16\ +\xe9\x01\0\0\x03\x31\x18\x14\x17\xe9\x01\0\0\x03\x32\x20\x14\x18\xe9\x01\0\0\ +\x03\x33\x28\x14\x19\xe9\x01\0\0\x03\x35\x30\x14\x1a\xe9\x01\0\0\x03\x36\x38\ +\x14\x1b\xe9\x01\0\0\x03\x37\x40\x14\x1c\xe9\x01\0\0\x03\x38\x48\x14\x1d\xe9\ 
+\x01\0\0\x03\x39\x50\x14\x1e\xe9\x01\0\0\x03\x3a\x58\x14\x1f\xe9\x01\0\0\x03\ +\x3b\x60\x14\x20\xe9\x01\0\0\x03\x3c\x68\x14\x21\xe9\x01\0\0\x03\x3d\x70\x14\ +\x22\xe9\x01\0\0\x03\x42\x78\x14\x23\xe9\x01\0\0\x03\x44\x80\x14\x24\xe9\x01\0\ +\0\x03\x45\x88\x14\x25\xe9\x01\0\0\x03\x46\x90\x14\x26\xe9\x01\0\0\x03\x47\x98\ +\x14\x27\xe9\x01\0\0\x03\x48\xa0\0\x05\x13\x07\x08\x15\x06\xf0\0\0\0\x01\x5a\ +\x2d\0\x0b\x1d\x01\0\0\x16\x02\x91\0\x11\0\x0b\x21\x01\0\0\x17\xfc\0\0\0\x07\ +\xb0\0\0\0\0\x0b\x05\x18\x02\x91\x20\x04\x01\0\0\x18\x02\x91\x1c\x0c\x01\0\0\ +\x18\x02\x91\x18\x14\x01\0\0\0\0\x11\x2b\0\x14\x1d\x01\0\0\x12\x11\0\x14\x21\ +\x01\0\0\x12\x2c\0\x14\x1d\x01\0\0\0\x15\x08\x68\0\0\0\x01\x5a\x2e\0\x14\x1d\ +\x01\0\0\x16\x02\x91\0\x11\0\x14\x21\x01\0\0\x17\x2e\x02\0\0\x09\x38\0\0\0\0\ +\x14\x05\x18\x02\x91\x18\x36\x02\0\0\x18\x02\x91\x14\x3e\x02\0\0\0\0\0\x14\0\0\ +\0\x05\0\x08\0\x01\0\0\0\x04\0\0\0\x03\x06\xf0\x01\x03\x08\x68\0\xc0\0\0\0\x05\ +\0\0\0\0\0\0\0\x45\0\0\0\x52\0\0\0\x86\0\0\0\x8e\0\0\0\x93\0\0\0\xa7\0\0\0\xb1\ +\0\0\0\xc4\0\0\0\xca\0\0\0\xd9\0\0\0\xe1\0\0\0\xf2\0\0\0\xf7\0\0\0\x04\x01\0\0\ +\x0a\x01\0\0\x19\x01\0\0\x1d\x01\0\0\x21\x01\0\0\x25\x01\0\0\x33\x01\0\0\x37\ +\x01\0\0\x3b\x01\0\0\x3f\x01\0\0\x43\x01\0\0\x47\x01\0\0\x4b\x01\0\0\x4f\x01\0\ +\0\x52\x01\0\0\x55\x01\0\0\x59\x01\0\0\x5d\x01\0\0\x61\x01\0\0\x65\x01\0\0\x69\ +\x01\0\0\x72\x01\0\0\x76\x01\0\0\x79\x01\0\0\x80\x01\0\0\x84\x01\0\0\x87\x01\0\ +\0\x8f\x01\0\0\x91\x01\0\0\x93\x01\0\0\xa5\x01\0\0\xa9\x01\0\0\xb4\x01\0\0\x63\ +\x6c\x61\x6e\x67\x20\x76\x65\x72\x73\x69\x6f\x6e\x20\x31\x38\x2e\x31\x2e\x38\ +\x20\x28\x52\x65\x64\x20\x48\x61\x74\x20\x31\x38\x2e\x31\x2e\x38\x2d\x31\x2e\ +\x6d\x6f\x64\x75\x6c\x65\x2b\x65\x6c\x38\x2e\x31\x30\x2e\x30\x2b\x37\x30\x33\ +\x2b\x65\x63\x37\x62\x33\x33\x62\x61\x29\0\x75\x70\x72\x6f\x62\x65\x2e\x62\x70\ +\x66\x2e\x63\0\x2f\x75\x73\x72\x2f\x6c\x69\x62\x2f\x74\x65\x6e\x63\x65\x6e\x74\ +\x6f\x73\x2d\x74\x6f\x6f\x6c\x73\x2f\x6f\x70\x73\x2f\x6f\x73\x5f\x73\x74\x61\ +\x74\x2f\x6f\x73\x5f\x73\x74\x61\x74\x5f\x75\x70\x72\x6f\x62\x65\0\x4c\x49\x43\ +\x45\x4e\x53\x45\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\ +\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x66\x75\x6e\x63\x5f\x61\x64\x64\x72\0\ +\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x5f\ +\x5f\x75\x36\x34\0\x66\x75\x6e\x63\x5f\x61\x64\x64\x72\x5f\x74\x65\x73\x74\0\ +\x5f\x5f\x5f\x5f\x66\x6d\x74\0\x62\x70\x66\x5f\x74\x72\x61\x63\x65\x5f\x70\x72\ +\x69\x6e\x74\x6b\0\x6c\x6f\x6e\x67\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\ +\x6e\x74\0\x5f\x5f\x75\x33\x32\0\x5f\x5f\x5f\x5f\x75\x70\x72\x6f\x62\x65\x5f\ +\x61\x64\x64\0\x69\x6e\x74\0\x63\x74\x78\0\x72\x31\x35\0\x75\x6e\x73\x69\x67\ +\x6e\x65\x64\x20\x6c\x6f\x6e\x67\0\x72\x31\x34\0\x72\x31\x33\0\x72\x31\x32\0\ +\x72\x62\x70\0\x72\x62\x78\0\x72\x31\x31\0\x72\x31\x30\0\x72\x39\0\x72\x38\0\ +\x72\x61\x78\0\x72\x63\x78\0\x72\x64\x78\0\x72\x73\x69\0\x72\x64\x69\0\x6f\x72\ +\x69\x67\x5f\x72\x61\x78\0\x72\x69\x70\0\x63\x73\0\x65\x66\x6c\x61\x67\x73\0\ +\x72\x73\x70\0\x73\x73\0\x70\x74\x5f\x72\x65\x67\x73\0\x61\0\x62\0\x5f\x5f\x5f\ +\x5f\x75\x72\x65\x74\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\0\x72\x65\x74\0\x75\ +\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\0\x75\x72\x65\x74\x70\x72\x6f\x62\x65\x5f\ +\x61\x64\x64\0\x54\0\0\0\x05\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x10\0\0\0\0\0\0\0\x38\0\0\0\0\0\0\0\x59\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x38\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x28\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\x18\0\0\0\ 
+\0\0\0\0\xf4\x02\0\0\xf4\x02\0\0\xb6\x02\0\0\0\0\0\0\x01\0\0\x0d\x02\0\0\0\0\0\ +\0\0\x03\0\0\0\x01\0\0\0\0\0\0\x01\x04\0\0\0\x20\0\0\x01\0\0\0\0\0\0\0\x02\x04\ +\0\0\0\x05\0\0\0\x15\0\0\x04\xa8\0\0\0\x0d\0\0\0\x05\0\0\0\0\0\0\0\x11\0\0\0\ +\x05\0\0\0\x40\0\0\0\x15\0\0\0\x05\0\0\0\x80\0\0\0\x19\0\0\0\x05\0\0\0\xc0\0\0\ +\0\x1d\0\0\0\x05\0\0\0\0\x01\0\0\x21\0\0\0\x05\0\0\0\x40\x01\0\0\x25\0\0\0\x05\ +\0\0\0\x80\x01\0\0\x29\0\0\0\x05\0\0\0\xc0\x01\0\0\x2d\0\0\0\x05\0\0\0\0\x02\0\ +\0\x30\0\0\0\x05\0\0\0\x40\x02\0\0\x33\0\0\0\x05\0\0\0\x80\x02\0\0\x37\0\0\0\ +\x05\0\0\0\xc0\x02\0\0\x3b\0\0\0\x05\0\0\0\0\x03\0\0\x3f\0\0\0\x05\0\0\0\x40\ +\x03\0\0\x43\0\0\0\x05\0\0\0\x80\x03\0\0\x47\0\0\0\x05\0\0\0\xc0\x03\0\0\x50\0\ +\0\0\x05\0\0\0\0\x04\0\0\x54\0\0\0\x05\0\0\0\x40\x04\0\0\x57\0\0\0\x05\0\0\0\ +\x80\x04\0\0\x5e\0\0\0\x05\0\0\0\xc0\x04\0\0\x62\0\0\0\x05\0\0\0\0\x05\0\0\x65\ +\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x73\0\0\0\x01\0\0\x0c\x01\0\0\0\0\0\0\0\ +\x01\0\0\x0d\x02\0\0\0\0\0\0\0\x03\0\0\0\x92\x01\0\0\x01\0\0\x0c\x07\0\0\0\x09\ +\x02\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\0\0\x03\0\0\0\0\x09\0\0\0\ +\x0b\0\0\0\x0d\0\0\0\x0e\x02\0\0\0\0\0\x01\x04\0\0\0\x20\0\0\0\x22\x02\0\0\0\0\ +\0\x0e\x0a\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\x0a\x0e\0\0\0\0\0\0\0\0\0\0\x09\x0f\0\ +\0\0\x2a\x02\0\0\0\0\0\x08\x10\0\0\0\x30\x02\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\ +\x43\x02\0\0\0\0\0\x0e\x0d\0\0\0\x01\0\0\0\x4d\x02\0\0\0\0\0\x0e\x0d\0\0\0\x01\ +\0\0\0\0\0\0\0\0\0\0\x0a\x09\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x13\0\0\0\x0b\0\0\ +\0\x28\0\0\0\x5c\x02\0\0\0\0\0\x0e\x14\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\ +\x13\0\0\0\x0b\0\0\0\x21\0\0\0\x73\x02\0\0\0\0\0\x0e\x16\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x03\0\0\0\0\x13\0\0\0\x0b\0\0\0\x1f\0\0\0\x8c\x02\0\0\0\0\0\x0e\x18\0\0\ +\0\0\0\0\0\xa6\x02\0\0\x05\0\0\x0f\0\0\0\0\x11\0\0\0\0\0\0\0\x08\0\0\0\x12\0\0\ +\0\0\0\0\0\x08\0\0\0\x15\0\0\0\x10\0\0\0\x28\0\0\0\x17\0\0\0\x38\0\0\0\x21\0\0\ +\0\x19\0\0\0\x59\0\0\0\x1f\0\0\0\xae\x02\0\0\x01\0\0\x0f\0\0\0\0\x0c\0\0\0\0\0\ +\0\0\x0d\0\0\0\0\x69\x6e\x74\0\x70\x74\x5f\x72\x65\x67\x73\0\x72\x31\x35\0\x72\ +\x31\x34\0\x72\x31\x33\0\x72\x31\x32\0\x72\x62\x70\0\x72\x62\x78\0\x72\x31\x31\ +\0\x72\x31\x30\0\x72\x39\0\x72\x38\0\x72\x61\x78\0\x72\x63\x78\0\x72\x64\x78\0\ +\x72\x73\x69\0\x72\x64\x69\0\x6f\x72\x69\x67\x5f\x72\x61\x78\0\x72\x69\x70\0\ +\x63\x73\0\x65\x66\x6c\x61\x67\x73\0\x72\x73\x70\0\x73\x73\0\x75\x6e\x73\x69\ +\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\0\x75\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\ +\0\x75\x70\x72\x6f\x62\x65\0\x2f\x75\x73\x72\x2f\x6c\x69\x62\x2f\x74\x65\x6e\ +\x63\x65\x6e\x74\x6f\x73\x2d\x74\x6f\x6f\x6c\x73\x2f\x6f\x70\x73\x2f\x6f\x73\ +\x5f\x73\x74\x61\x74\x2f\x6f\x73\x5f\x73\x74\x61\x74\x5f\x75\x70\x72\x6f\x62\ +\x65\x2f\x75\x70\x72\x6f\x62\x65\x2e\x62\x70\x66\x2e\x63\0\x69\x6e\x74\x20\x42\ +\x50\x46\x5f\x4b\x50\x52\x4f\x42\x45\x28\x75\x70\x72\x6f\x62\x65\x5f\x61\x64\ +\x64\x2c\x20\x69\x6e\x74\x20\x61\x2c\x20\x69\x6e\x74\x20\x62\x29\0\x09\x62\x70\ +\x66\x5f\x70\x72\x69\x6e\x74\x6b\x28\x22\x75\x70\x72\x6f\x62\x65\x64\x5f\x61\ +\x64\x64\x20\x45\x4e\x54\x52\x59\x3a\x20\x61\x20\x3d\x20\x25\x64\x2c\x20\x62\ +\x20\x3d\x20\x25\x64\x2c\x20\x25\x6c\x78\x5c\x6e\x22\x2c\x20\x61\x2c\x20\x62\ +\x2c\x20\x63\x74\x78\x29\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x69\x6e\x74\x6b\x28\ +\x22\x75\x70\x72\x6f\x62\x65\x64\x5f\x61\x64\x64\x20\x45\x4e\x54\x52\x59\x3a\ +\x20\x25\x6c\x78\x20\x25\x6c\x78\x2c\x20\x25\x6c\x78\x5c\x6e\x22\x2c\x20\x66\ +\x75\x6e\x63\x5f\x61\x64\x64\x72\x2c\x20\x66\x75\x6e\x63\x5f\x61\x64\x64\x72\ 
+\x5f\x74\x65\x73\x74\x2c\x20\x50\x54\x5f\x52\x45\x47\x53\x5f\x52\x45\x54\x28\ +\x63\x74\x78\x29\x29\x3b\0\x75\x72\x65\x74\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\ +\0\x75\x72\x65\x74\x70\x72\x6f\x62\x65\0\x69\x6e\x74\x20\x42\x50\x46\x5f\x4b\ +\x52\x45\x54\x50\x52\x4f\x42\x45\x28\x75\x72\x65\x74\x70\x72\x6f\x62\x65\x5f\ +\x61\x64\x64\x2c\x20\x69\x6e\x74\x20\x72\x65\x74\x29\0\x09\x62\x70\x66\x5f\x70\ +\x72\x69\x6e\x74\x6b\x28\x22\x75\x70\x72\x6f\x62\x65\x64\x5f\x61\x64\x64\x20\ +\x45\x58\x49\x54\x3a\x20\x72\x65\x74\x75\x72\x6e\x20\x3d\x20\x25\x64\x5c\x6e\ +\x22\x2c\x20\x72\x65\x74\x29\x3b\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\ +\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x4c\x49\x43\x45\x4e\x53\ +\x45\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\ +\x67\x20\x6c\x6f\x6e\x67\0\x66\x75\x6e\x63\x5f\x61\x64\x64\x72\0\x66\x75\x6e\ +\x63\x5f\x61\x64\x64\x72\x5f\x74\x65\x73\x74\0\x5f\x5f\x5f\x5f\x75\x70\x72\x6f\ +\x62\x65\x5f\x61\x64\x64\x2e\x5f\x5f\x5f\x5f\x66\x6d\x74\0\x5f\x5f\x5f\x5f\x75\ +\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\x2e\x5f\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\ +\x5f\x5f\x5f\x5f\x75\x72\x65\x74\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\x2e\x5f\ +\x5f\x5f\x5f\x66\x6d\x74\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\ +\x73\x65\0\0\0\x9f\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\xa4\0\0\0\ +\xc8\0\0\0\0\0\0\0\x08\0\0\0\x7e\0\0\0\x01\0\0\0\0\0\0\0\x06\0\0\0\xa0\x01\0\0\ +\x01\0\0\0\0\0\0\0\x08\0\0\0\x10\0\0\0\x7e\0\0\0\x05\0\0\0\0\0\0\0\x85\0\0\0\ +\xc6\0\0\0\0\x2c\0\0\x08\0\0\0\x85\0\0\0\xc6\0\0\0\x05\x2c\0\0\x38\0\0\0\x85\0\ +\0\0\xef\0\0\0\x02\x38\0\0\x78\0\0\0\x85\0\0\0\x33\x01\0\0\x02\x3c\0\0\xe8\0\0\ +\0\x85\0\0\0\xc6\0\0\0\x05\x2c\0\0\xa0\x01\0\0\x04\0\0\0\0\0\0\0\x85\0\0\0\xaa\ +\x01\0\0\0\x50\0\0\x08\0\0\0\x85\0\0\0\xaa\x01\0\0\x05\x50\0\0\x28\0\0\0\x85\0\ +\0\0\xd4\x01\0\0\x02\x58\0\0\x60\0\0\0\x85\0\0\0\xaa\x01\0\0\x05\x50\0\0\x0c\0\ +\0\0\xff\xff\xff\xff\x04\0\x08\0\x08\x7c\x0b\0\x14\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\xf0\0\0\0\0\0\0\0\x14\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x68\0\0\0\0\0\0\0\xc1\0\ +\0\0\x05\0\x08\0\x7a\0\0\0\x08\x01\x01\xfb\x0e\x0d\0\x01\x01\x01\x01\0\0\0\x01\ +\0\0\x01\x01\x01\x1f\x02\0\0\0\0\x34\0\0\0\x03\x01\x1f\x02\x0f\x05\x1e\x04\x39\ +\0\0\0\0\x0c\x95\x02\x15\x86\x1a\x0b\x73\xa6\x04\x4b\x26\x43\xcd\x06\x90\x46\0\ +\0\0\x01\xb8\x10\xf2\x70\x73\x3e\x10\x63\x19\xb6\x7e\xf5\x12\xc6\x24\x6e\x65\0\ +\0\0\x01\xa5\xa8\xa4\xf9\x34\xaa\x57\x11\xde\xc2\x3f\xec\x64\x5c\x40\x01\x83\0\ +\0\0\x01\xd0\x91\xe8\x97\x97\x2e\0\x58\xad\x5c\xa6\x2f\x6e\x36\x6c\x3e\x04\0\0\ +\x09\x02\0\0\0\0\0\0\0\0\x03\x0a\x01\x05\x05\x0a\x20\x05\x02\x69\x83\x05\x05\ +\xd2\x02\x01\0\x01\x01\x04\0\0\x09\x02\0\0\0\0\0\0\0\0\x03\x13\x01\x05\x05\x0a\ +\x20\x05\x02\x4c\x05\x05\x72\x02\x01\0\x01\x01\x2f\x75\x73\x72\x2f\x6c\x69\x62\ +\x2f\x74\x65\x6e\x63\x65\x6e\x74\x6f\x73\x2d\x74\x6f\x6f\x6c\x73\x2f\x6f\x70\ +\x73\x2f\x6f\x73\x5f\x73\x74\x61\x74\x2f\x6f\x73\x5f\x73\x74\x61\x74\x5f\x75\ +\x70\x72\x6f\x62\x65\0\x2f\x75\x73\x72\0\x75\x70\x72\x6f\x62\x65\x2e\x62\x70\ +\x66\x2e\x63\0\x69\x6e\x63\x6c\x75\x64\x65\x2f\x61\x73\x6d\x2d\x67\x65\x6e\x65\ +\x72\x69\x63\x2f\x69\x6e\x74\x2d\x6c\x6c\x36\x34\x2e\x68\0\x69\x6e\x63\x6c\x75\ +\x64\x65\x2f\x62\x70\x66\x2f\x62\x70\x66\x5f\x68\x65\x6c\x70\x65\x72\x5f\x64\ +\x65\x66\x73\x2e\x68\0\x69\x6e\x63\x6c\x75\x64\x65\x2f\x61\x73\x6d\x2f\x70\x74\ +\x72\x61\x63\x65\x2e\x68\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x37\x01\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x03\ 
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x31\0\0\0\x01\0\x08\0\x10\0\0\0\0\0\0\0\x28\ +\0\0\0\0\0\0\0\x6d\x01\0\0\x01\0\x08\0\x38\0\0\0\0\0\0\0\x21\0\0\0\0\0\0\0\0\0\ +\0\0\x03\0\x05\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\0\0\0\x01\0\x08\0\x59\0\0\ +\0\0\0\0\0\x1f\0\0\0\0\0\0\0\0\0\0\0\x03\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x03\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x0c\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x0d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x03\0\x0f\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x10\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x03\0\x18\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\x1a\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x1e\x01\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\xf0\0\0\0\ +\0\0\0\0\xb4\0\0\0\x11\0\x08\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\x22\0\0\0\x11\ +\0\x08\0\x08\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\x29\x01\0\0\x12\0\x05\0\0\0\0\0\0\ +\0\0\0\x68\0\0\0\0\0\0\0\x65\x01\0\0\x11\0\x07\0\0\0\0\0\0\0\0\0\x0d\0\0\0\0\0\ +\0\0\x50\0\0\0\0\0\0\0\x01\0\0\0\x07\0\0\0\x78\0\0\0\0\0\0\0\x01\0\0\0\x11\0\0\ +\0\x90\0\0\0\0\0\0\0\x01\0\0\0\x12\0\0\0\xb8\0\0\0\0\0\0\0\x01\0\0\0\x07\0\0\0\ +\x30\0\0\0\0\0\0\0\x01\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x03\0\0\0\x08\0\0\0\ +\x11\0\0\0\0\0\0\0\x03\0\0\0\x0a\0\0\0\x15\0\0\0\0\0\0\0\x03\0\0\0\x0e\0\0\0\ +\x23\0\0\0\0\0\0\0\x03\0\0\0\x0c\0\0\0\x27\0\0\0\0\0\0\0\x03\0\0\0\x09\0\0\0\ +\x08\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x0c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x10\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x14\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x18\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x1c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x20\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x24\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x28\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x2c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x30\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x34\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x38\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x40\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x44\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x48\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x4c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x50\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x54\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x58\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x5c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x60\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x64\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x68\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x6c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x70\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x74\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x78\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x7c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x80\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x84\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x88\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x8c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x90\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x94\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\x98\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x9c\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\xa0\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\xa4\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\xa8\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\xac\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\xb0\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\xb4\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\xb8\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\xbc\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\ +\xc0\0\0\0\0\0\0\0\x03\0\0\0\x0b\0\0\0\x08\0\0\0\0\0\0\0\x02\0\0\0\x14\0\0\0\ +\x10\0\0\0\0\0\0\0\x02\0\0\0\x11\0\0\0\x18\0\0\0\0\0\0\0\x02\0\0\0\x12\0\0\0\ +\x20\0\0\0\0\0\0\0\x02\0\0\0\x07\0\0\0\x28\0\0\0\0\0\0\0\x02\0\0\0\x07\0\0\0\ 
+\x30\0\0\0\0\0\0\0\x02\0\0\0\x07\0\0\0\x38\0\0\0\0\0\0\0\x02\0\0\0\x02\0\0\0\ +\x40\0\0\0\0\0\0\0\x02\0\0\0\x02\0\0\0\x48\0\0\0\0\0\0\0\x02\0\0\0\x05\0\0\0\ +\x50\0\0\0\0\0\0\0\x02\0\0\0\x05\0\0\0\xbc\x02\0\0\0\0\0\0\x03\0\0\0\x11\0\0\0\ +\xc8\x02\0\0\0\0\0\0\x03\0\0\0\x12\0\0\0\xd4\x02\0\0\0\0\0\0\x03\0\0\0\x07\0\0\ +\0\xe0\x02\0\0\0\0\0\0\x03\0\0\0\x07\0\0\0\xec\x02\0\0\0\0\0\0\x03\0\0\0\x07\0\ +\0\0\x04\x03\0\0\0\0\0\0\x04\0\0\0\x14\0\0\0\x2c\0\0\0\0\0\0\0\x04\0\0\0\x02\0\ +\0\0\x3c\0\0\0\0\0\0\0\x04\0\0\0\x05\0\0\0\x50\0\0\0\0\0\0\0\x04\0\0\0\x02\0\0\ +\0\x60\0\0\0\0\0\0\0\x04\0\0\0\x02\0\0\0\x70\0\0\0\0\0\0\0\x04\0\0\0\x02\0\0\0\ +\x80\0\0\0\0\0\0\0\x04\0\0\0\x02\0\0\0\x90\0\0\0\0\0\0\0\x04\0\0\0\x02\0\0\0\ +\xa8\0\0\0\0\0\0\0\x04\0\0\0\x05\0\0\0\xb8\0\0\0\0\0\0\0\x04\0\0\0\x05\0\0\0\ +\xc8\0\0\0\0\0\0\0\x04\0\0\0\x05\0\0\0\xd8\0\0\0\0\0\0\0\x04\0\0\0\x05\0\0\0\ +\x14\0\0\0\0\0\0\0\x03\0\0\0\x0d\0\0\0\x18\0\0\0\0\0\0\0\x02\0\0\0\x02\0\0\0\ +\x2c\0\0\0\0\0\0\0\x03\0\0\0\x0d\0\0\0\x30\0\0\0\0\0\0\0\x02\0\0\0\x05\0\0\0\ +\x22\0\0\0\0\0\0\0\x03\0\0\0\x0f\0\0\0\x26\0\0\0\0\0\0\0\x03\0\0\0\x0f\0\0\0\ +\x32\0\0\0\0\0\0\0\x03\0\0\0\x0f\0\0\0\x47\0\0\0\0\0\0\0\x03\0\0\0\x0f\0\0\0\ +\x5c\0\0\0\0\0\0\0\x03\0\0\0\x0f\0\0\0\x71\0\0\0\0\0\0\0\x03\0\0\0\x0f\0\0\0\ +\x8b\0\0\0\0\0\0\0\x02\0\0\0\x02\0\0\0\xab\0\0\0\0\0\0\0\x02\0\0\0\x05\0\0\0\ +\x10\x13\x14\x11\x12\x03\x04\x06\0\x2e\x64\x65\x62\x75\x67\x5f\x61\x62\x62\x72\ +\x65\x76\0\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\ +\x74\0\x66\x75\x6e\x63\x5f\x61\x64\x64\x72\x5f\x74\x65\x73\x74\0\x5f\x5f\x5f\ +\x5f\x75\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\x2e\x5f\x5f\x5f\x5f\x66\x6d\x74\0\ +\x5f\x5f\x5f\x5f\x75\x72\x65\x74\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\x2e\x5f\ +\x5f\x5f\x5f\x66\x6d\x74\0\x2e\x64\x65\x62\x75\x67\x5f\x72\x6e\x67\x6c\x69\x73\ +\x74\x73\0\x2e\x72\x65\x6c\x2e\x64\x65\x62\x75\x67\x5f\x73\x74\x72\x5f\x6f\x66\ +\x66\x73\x65\x74\x73\0\x2e\x64\x65\x62\x75\x67\x5f\x73\x74\x72\0\x2e\x64\x65\ +\x62\x75\x67\x5f\x6c\x69\x6e\x65\x5f\x73\x74\x72\0\x2e\x72\x65\x6c\x2e\x64\x65\ +\x62\x75\x67\x5f\x61\x64\x64\x72\0\x66\x75\x6e\x63\x5f\x61\x64\x64\x72\0\x2e\ +\x72\x65\x6c\x2e\x64\x65\x62\x75\x67\x5f\x69\x6e\x66\x6f\0\x2e\x6c\x6c\x76\x6d\ +\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\x6e\x73\x65\0\x2e\x72\x65\ +\x6c\x2e\x64\x65\x62\x75\x67\x5f\x6c\x69\x6e\x65\0\x2e\x72\x65\x6c\x2e\x64\x65\ +\x62\x75\x67\x5f\x66\x72\x61\x6d\x65\0\x2e\x72\x65\x6c\x75\x70\x72\x6f\x62\x65\ +\0\x2e\x72\x65\x6c\x75\x72\x65\x74\x70\x72\x6f\x62\x65\0\x75\x70\x72\x6f\x62\ +\x65\x5f\x61\x64\x64\0\x75\x72\x65\x74\x70\x72\x6f\x62\x65\x5f\x61\x64\x64\0\ +\x75\x70\x72\x6f\x62\x65\x2e\x62\x70\x66\x2e\x63\0\x2e\x73\x74\x72\x74\x61\x62\ +\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\x61\x74\x61\0\x2e\x72\x65\x6c\ +\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\x5f\x5f\x5f\x5f\x75\x70\x72\ +\x6f\x62\x65\x5f\x61\x64\x64\x2e\x5f\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x44\x01\0\0\x03\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x08\x19\0\0\0\0\0\0\x86\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x09\x01\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\ +\0\0\0\0\0\0\xf0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ 
+\0\x05\x01\0\0\x09\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\ +\x40\0\0\0\0\0\0\0\x1c\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x14\ +\x01\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x30\x01\0\0\0\0\0\0\x68\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\x01\0\0\x09\ +\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ +\x1c\0\0\0\x05\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\xdc\0\0\0\x01\0\0\0\ +\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0d\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x54\x01\0\0\x01\0\0\0\x02\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\xa8\x01\0\0\0\0\0\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\x20\x02\0\0\0\0\0\0\x24\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x44\ +\x03\0\0\0\0\0\0\x81\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\xbe\0\0\0\x09\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x50\x13\0\0\0\0\ +\0\0\x50\0\0\0\0\0\0\0\x1c\0\0\0\x0a\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ +\x62\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc5\x05\0\0\0\0\0\0\x18\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x76\0\0\0\x01\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xdd\x05\0\0\0\0\0\0\xc4\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x72\0\0\0\x09\0\0\0\x40\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\xa0\x13\0\0\0\0\0\0\xf0\x02\0\0\0\0\0\0\x1c\0\0\0\x0d\0\ +\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x89\0\0\0\x01\0\0\0\x30\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\xa1\x06\0\0\0\0\0\0\xc2\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\ +\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\xa8\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\x63\x08\0\0\0\0\0\0\x58\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\xa4\0\0\0\x09\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x90\x16\0\ +\0\0\0\0\0\xa0\0\0\0\0\0\0\0\x1c\0\0\0\x10\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\ +\0\0\0\x60\x01\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbc\x08\0\0\0\0\0\ +\0\xc2\x05\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x5c\ +\x01\0\0\x09\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x30\x17\0\0\0\0\0\0\x60\0\ +\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x19\0\0\0\ +\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\0\0\0\0\xe8\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x15\0\0\0\x09\0\0\0\x40\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\0\0\0\0\xb0\0\0\0\0\0\0\0\x1c\0\0\0\ +\x14\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\xf8\0\0\0\x01\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\x68\x0f\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf4\0\0\0\x09\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x40\x18\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\x1c\0\0\0\x16\0\0\0\x08\0\0\0\0\0\ +\0\0\x10\0\0\0\0\0\0\0\xe8\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa8\ +\x0f\0\0\0\0\0\0\xc5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\xe4\0\0\0\x09\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x80\x18\0\0\0\0\0\ +\0\x80\0\0\0\0\0\0\0\x1c\0\0\0\x18\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ +\x94\0\0\0\x01\0\0\0\x30\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\x10\0\0\0\0\0\0\x98\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\xce\0\0\0\ +\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\0\0\0\x08\0\0\0\ 
+\0\0\0\0\x1c\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4c\x01\0\0\x02\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\x11\0\0\0\0\0\0\xf8\x01\0\0\0\0\0\0\ +\x01\0\0\0\x10\0\0\0\x08\0\0\0\0\0\0\0\x18\0\0\0\0\0\0\0"; + + return 0; +err: + bpf_object__destroy_skeleton(s); + return -1; +} + +#endif /* __UPROBE_BPF_SKEL_H__ */ diff --git a/ops/os_stat/os_stat_uprobe/uprobe_trace.sh b/ops/os_stat/os_stat_uprobe/uprobe_trace.sh new file mode 100755 index 0000000000000000000000000000000000000000..5596e4e8187513ce45c303c2b035fb445a6aabdc --- /dev/null +++ b/ops/os_stat/os_stat_uprobe/uprobe_trace.sh @@ -0,0 +1,104 @@ +#!/bin/sh + +function get_offset() +{ + ./get_offset.py $1 $2 +} +function helper() +{ + echo -e "\033[32m -f, --function: \"which scan from this function\" \033[0m" + echo -e "\033[32m -fg, --flags: 0:test hot path, 1: test one function, ..., 6:test user function;\033[0m" + echo -e "\033[32m -a, --parameter: show parameter name in struct .etc\033[0m" + echo -e "\033[32m -p, --proc: only track the process assigned by proc, default: no proc assigned\033[0m" + echo -e "\033[32m -e, --exec: -e: pin with path\033[0m" + echo -e "\033[32m -h, --help: help information\033[0m" + echo such as: + echo -e "\033[32m 1.trace (a):process pid:'2416976', (b):user function:'uprobed_add', (c):user bin:'./uprobe' which contains function \033[0m" + echo -e "\033[32m (1)t-ops os_stat -fg 6 -a 1 -p 2416976 -f uprobed_add -e ./uprobe\033[0m" +} +#main +function main() +{ + PARAMETER={} + STRUCT={} + POINTER={} + FUNCTION="" + TYPE="" + PROC="" + DELAY=0 + i=0 + j=0 + k=0 + m=0 + order=0 + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -a|--parameter) + PARAMETER[i]="$2" + i=$(($i+1)) + shift # past argument + shift # past value + ;; + -p|--proc) + PROC="$2" + shift # past argument + shift # past value + ;; + -f|--function) + FUNCTION="$2" + shift # past argument + shift # past value + ;; + -e|--exec) + EXEC="$2" + shift # past argument + shift # past value + ;; + -h|--help) + helper + return + shift # past argument + ;; + *) + POSITIONAL+=("$1") # save it in an array for later + shift # past argument + ;; + esac + done + + cd /usr/lib/tencentos-tools/ops/os_stat/os_stat_uprobe/ + offset=0 + if [ ! -f "/usr/lib/tencentos-tools/ops/os_stat/os_stat_uprobe/uprobe" ];then + if [ ! -d "/usr/include/bpf" ];then + printf "\033[31m libbpf is not exist\n\033[0m" + printf "\033[32m git clone --depth 1 https://github.com/libbpf/libbpf\n\033[0m" + printf "\033[32m cd libbpf/src/; make install\n\033[0m" + printf "\033[32m if error, add in libbpf.c\n\033[0m" + exit + fi + bpftool_ret=$(rpm -qa | grep "bpftool") + if [ -z "$bpftool_ret" ]; then + yum install bpftool + fi + + echo ------make new---------------- + clang -g -o2 -target bpf -D__TARGET_ARCH_x86 -D__TARGET_ARCH_x86_64 -I /usr/include/x86_64-linux-gnu -I. -I ./libbpf/src -c uprobe.bpf.c -o uprobe.bpf.o + + bpftool gen skeleton uprobe.bpf.o > uprobe.skel.h + clang -Wall -g -I. -c uprobe.c -o uprobe.o + clang -Wall -g uprobe.o -L/usr/lib64 -lbpf -lelf -lz -o uprobe + + fi + if [ ! -z "$FUNCTION" ] && [ ! 
-z "$EXEC" ]; then + objdump -S $EXEC | grep -w $FUNCTION | grep "<" | grep : > data.txt + offset=$(get_offset data.txt $FUNCTION) + cat data.txt + echo $offset + fi + ./uprobe ${PARAMETER[0]} $PROC "0x"$offset + + cd - +} + +main $* diff --git a/ops/os_stat/os_stat_user/Makefile b/ops/os_stat/os_stat_user/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..248acd51402db666b544c4160fef485df4583cd3 --- /dev/null +++ b/ops/os_stat/os_stat_user/Makefile @@ -0,0 +1,30 @@ +#Makefile + +os_stat_blongm : main.o do_adjust.o + cc -o os_stat_blongm main.o do_adjust.o + +os_stat_data : data_only.o do_adjust.o + cc -o os_stat_data data_only.o do_adjust.o + +os_stat_test : test.o + cc -o os_stat_test test.o + +os_stat_scene_nohook : main_scene.o do_adjust.o + cc -o os_stat_scene_nohook main_scene.o do_adjust.o + +main.o : main.c + cc -c main.c + +main_scene.o : main_scene.c + cc -c main_scene.c + +data_only.o : data_only.c + cc -c data_only.c + +do_adjust.o : do_adjust.c + cc -c do_adjust.c + +clean : + rm *.o *.txt os_stat_blongm os_stat_data os_stat_test os_stat_scene_nohook .data* .*.o.cmd .*.ko.cmd .*.c.swp .*.mod.cmd + + diff --git a/ops/os_stat/os_stat_user/README b/ops/os_stat/os_stat_user/README new file mode 100644 index 0000000000000000000000000000000000000000..67fa3e535d3ef1260063d556f20ba1f908b9144d --- /dev/null +++ b/ops/os_stat/os_stat_user/README @@ -0,0 +1,111 @@ +欢迎大家参与开发 +用户态自动化扫描工具:(需要配合内核态模块化工具使用) +1.insmod os_aware.ko +2.执行用户态工具 + +用户态工具用法: +./stat.sh 扫描各个系统调用/缺页异常;并且输出排名前十的syscall,输出缺页异常数据; +目的达到通过扫描找到最热点 + +单点使用: +./os_stat function #扫描一个函数,并且输出性能数据; +例如: 跟踪vfs_read的性能数据: +# ./os_stat vfs_read +--------syscall stat----------- +nr: 201, num: 5961, ave latency: 671(ns), __x64_sys_time+0x0/0x40 +nr: 232, num: 2244, ave latency: 8724(ns), __x64_sys_epoll_wait+0x0/0x30 +nr: 96, num: 2075, ave latency: 11(ns), __x64_sys_gettimeofday+0x0/0xc0 +nr: 257, num: 966, ave latency: 782(ns), __x64_sys_openat+0x0/0x30 +nr: 0, num: 965, ave latency: 1077(ns), __x64_sys_read+0x0/0x20 +nr: 3, num: 925, ave latency: 147(ns), __x64_sys_close+0x0/0x50 +nr: 23, num: 904, ave latency: 20(ns), __x64_sys_select+0x0/0x30 +nr: 4, num: 568, ave latency: 0(ns), __x64_sys_newstat+0x0/0x20 +nr: 9, num: 276, ave latency: 66(ns), __x64_sys_mmap+0x0/0x40 +--------page fault stat----------- +num: 7086, ave latency: 1640(ns), do_page_fault + func:vfs_read(), num: 4729, ave latency: 3393(ns) + +TODO list: +1.task级别性能追踪支持 ----------------------------支持中; +2.cflow同时扫描net/*/, fs/*/时,速度比较慢,需要区分; +3.历史性能对比 +4.性能监控,本工具可以用来监控性能, + 与运维工具集成实时监控某一个函数的性能 +5.内核模块部分,函数表对于parent/grand的hook正确性, + 保证sprintf指针正确性,保证内核稳定; +6.增加可选功能,直接定制追踪某一领域功能,例如网络/文件系统/调度等性能,协助零门槛使用 +7.根据当前各个子系统参数进行自动调参优化,选择不同场景最佳参数; + +这是一次测试情况: +./stat.sh os_aware_0802_001.ko /data/tkernel4/ vfs_read fs & + +******real index: 1, index: 1, level: 0, num:: 803038, latency: 3312 ns, total latency:2659661856 ns, func:vfs_read, origin:vfs_read +******real index: 5, index: 5, level: 1, num:: 773481, latency: 657 ns, total latency: 508177017 ns, func:rw_verify_area, origin:rw_verify_area +******real index: 424, index: 6, level: 1, num:: 838819, latency: 3673 ns, total latency:3080982187 ns, func:__vfs_read, origin:__vfs_read +******real index: 425, index: 424, level: 2, num:: 7, latency: 1341 ns, total latency: 9387 ns, func:file->f_op->read, origin:proc_sys_read +******real index: 427, index: 427, level: 2, num:: 871376, latency: 3398 ns, total latency:2960935648 ns, func:new_sync_read, origin:new_sync_read +******real index: 434, index: 433, 
level: 4, num:: 207, latency: 2860 ns, total latency: 592020 ns, func:file->f_op->read_iter, origin:sock_read_iter +******real index: 434, index: 434, level: 4, num:: 793871, latency: 3010 ns, total latency:2389551710 ns, func:file->f_op->read_iter, origin:ext4_file_read_iter +******real index: 1, index: 0, level: 0, num:: 848572, latency: 3184 ns, total latency:2701853248 ns, func:ext4_file_read_iter, origin:ext4_file_read_iter +******real index: 1, index: 1, level: 0, num:: 641625, latency: 3587 ns, total latency:2301508875 ns, func:ext4_file_read_iter, origin:ext4_file_read_iter +******real index: 4148, index: 10, level: 1, num:: 629895, latency: 3183 ns, total latency:2004955785 ns, func:generic_file_read_iter, origin:generic_file_read_iter +******real index: 4535, index: 4156, level: 2, num:: 818727, latency: 3343 ns, total latency:2737004361 ns, func:mapping->a_ops->direct_IO, origin:ext4_file_read_iter +******real index: 4538, index: 4159, level: 2, num:: 831064, latency: 2967 ns, total latency:2465766888 ns, func:generic_file_buffered_read, origin:generic_file_buffered_read +******real index: 4862, index: 4542, level: 3, num:: 1734, latency: 425313 ns, total latency: 737492742 ns, func:page_cache_sync_readahead, origin:page_cache_sync_readahead +******real index: 5019, index: 4544, level: 3, num:: 330, latency: 261 ns, total latency: 86130 ns, func:put_page, origin:put_page +******real index: 5100, index: 4545, level: 3, num:: 1861, latency: 106748 ns, total latency: 198658028 ns, func:page_cache_async_readahead, origin:page_cache_async_readahead +******real index: 5297, index: 4552, level: 3, num:: 126441, latency: 61634 ns, total latency:7793064594 ns, func:unlock_page, origin:unlock_page +******real index: 5342, index: 4557, level: 3, num:: 5990121, latency: 1855 ns, total latency:11111674455 ns, func:mark_page_accessed, origin:mark_page_accessed +******real index: 5599, index: 4558, level: 3, num:: 5388318, latency: 1696 ns, total latency:9138587328 ns, func:copy_page_to_iter, origin:copy_page_to_iter +******real index: 5685, index: 4566, level: 3, num:: 204282, latency: 1868 ns, total latency: 381598776 ns, func:add_to_page_cache_lru, origin:add_to_page_cache_lru +******real index: 5351, index: 5344, level: 4, num:: 3458651, latency: 836 ns, total latency:2891432236 ns, func:mem_cgroup_from_task, origin:mem_cgroup_from_task +******real index: 5389, index: 5356, level: 4, num:: 562230, latency: 957 ns, total latency: 538054110 ns, func:activate_page, origin:activate_page +******real index: 5520, index: 5359, level: 4, num:: 629171, latency: 927 ns, total latency: 583241517 ns, func:workingset_activation, origin:workingset_activation +******real index: 5682, index: 5367, level: 4, num:: 293079, latency: 1734 ns, total latency: 508198986 ns, func:__page_cache_alloc, origin:__page_cache_alloc +******real index: 5694, index: 5370, level: 4, num:: 3528308, latency: 740 ns, total latency:2610947920 ns, func:mem_cgroup_from_task, origin:mem_cgroup_from_task +******real index: 5725, index: 5378, level: 4, num:: 216432, latency: 1799 ns, total latency: 389361168 ns, func:__add_to_page_cache_locked, origin:__add_to_page_cache_locked +******real index: 6156, index: 5383, level: 4, num:: 92091, latency: 332 ns, total latency: 30574212 ns, func:lru_cache_add, origin:lru_cache_add +******real index: 6230, index: 5385, level: 4, num:: 630948, latency: 693 ns, total latency: 437246964 ns, func:touch_atime, origin:touch_atime +******real index: 5413, index: 5382, level: 5, num:: 206030, 
latency: 7053 ns, total latency:1453129590 ns, func:pagevec_lru_move_fn, origin:pagevec_lru_move_fn +******real index: 5443, index: 5383, level: 5, num:: 628557, latency: 254 ns, total latency: 159653478 ns, func:__activate_page, origin:__activate_page +******real index: 5481, index: 5386, level: 5, num:: 1248, latency: 168 ns, total latency: 209664 ns, func:lock_page_lruvec_irq, origin:lock_page_lruvec_irq +******real index: 5535, index: 5396, level: 5, num:: 888233, latency: 651 ns, total latency: 578239683 ns, func:mem_cgroup_page_lruvec, origin:mem_cgroup_page_lruvec +******real index: 5547, index: 5398, level: 5, num:: 672510, latency: 696 ns, total latency: 468066960 ns, func:workingset_age_nonresident, origin:workingset_age_nonresident +******real index: 5727, index: 5436, level: 5, num:: 3861245, latency: 514 ns, total latency:1984679930 ns, func:PageHuge, origin:PageHuge +******real index: 5748, index: 5442, level: 5, num:: 229352, latency: 460 ns, total latency: 105501920 ns, func:mem_cgroup_charge, origin:mem_cgroup_charge +******real index: 5860, index: 5445, level: 5, num:: 781026, latency: 481 ns, total latency: 375673506 ns, func:mem_cgroup_shrink_pagecache, origin:mem_cgroup_shrink_pagecache +******real index: 5889, index: 5446, level: 5, num:: 224191, latency: 614 ns, total latency: 137653274 ns, func:xa_get_order, origin:xa_get_order +******real index: 5896, index: 5449, level: 5, num:: 313594, latency: 697 ns, total latency: 218575018 ns, func:xa_load, origin:xa_load +******real index: 5905, index: 5456, level: 5, num:: 89883, latency: 321 ns, total latency: 28852443 ns, func:xas_store, origin:xas_store +******real index: 5917, index: 5460, level: 5, num:: 427102, latency: 181 ns, total latency: 77305462 ns, func:xas_nomem, origin:xas_nomem +******real index: 5918, index: 5461, level: 5, num:: 148392, latency: 723 ns, total latency: 107287416 ns, func:mem_cgroup_uncharge, origin:mem_cgroup_uncharge +******real index: 5946, index: 5463, level: 5, num:: 226, latency: 249 ns, total latency: 56274 ns, func:put_page, origin:put_page +******real index: 6114, index: 5479, level: 5, num:: 834536, latency: 721 ns, total latency: 601700456 ns, func:workingset_age_nonresident, origin:workingset_age_nonresident +******real index: 6185, index: 5495, level: 5, num:: 7421, latency: 2223 ns, total latency: 16496883 ns, func:__pagevec_lru_add, origin:__pagevec_lru_add +******real index: 6232, index: 5498, level: 5, num:: 840550, latency: 687 ns, total latency: 577457850 ns, func:atime_needs_update, origin:atime_needs_update +******real index: 6250, index: 5500, level: 5, num:: 179, latency: 381 ns, total latency: 68199 ns, func:__mnt_want_write, origin:__mnt_want_write +******real index: 6265, index: 5501, level: 5, num:: 931732, latency: 485 ns, total latency: 451890020 ns, func:current_time, origin:current_time +******real index: 6277, index: 5503, level: 5, num:: 128, latency: 387 ns, total latency: 49536 ns, func:__mnt_drop_write, origin:__mnt_drop_write +******real index: 5782, index: 5746, level: 6, num:: 467728, latency: 248 ns, total latency: 115996544 ns, func:get_mem_cgroup_from_mm, origin:get_mem_cgroup_from_mm +******real index: 5790, index: 5747, level: 6, num:: 254173, latency: 412 ns, total latency: 104719276 ns, func:try_charge, origin:try_charge +******real index: 5832, index: 5751, level: 6, num:: 214508, latency: 369 ns, total latency: 79153452 ns, func:mem_cgroup_charge_statistics, origin:mem_cgroup_charge_statistics +******real index: 5835, index: 5752, level: 6, 
num:: 411658, latency: 439 ns, total latency: 180717862 ns, func:memcg_check_events, origin:memcg_check_events +******real index: 5903, index: 5776, level: 6, num:: 90272, latency: 341 ns, total latency: 30782752 ns, func:xas_store, origin:xas_store +******real index: 5923, index: 5781, level: 6, num:: 105835, latency: 513 ns, total latency: 54293355 ns, func:uncharge_page, origin:uncharge_page +******real index: 5933, index: 5782, level: 6, num:: 77310, latency: 1198 ns, total latency: 92617380 ns, func:uncharge_batch, origin:uncharge_batch +******real index: 5958, index: 5786, level: 6, num:: 67006, latency: 442 ns, total latency: 29616652 ns, func:__put_page, origin:__put_page +******real index: 6041, index: 5820, level: 6, num:: 1083, latency: 599 ns, total latency: 648717 ns, func:idr_find, origin:idr_find +******real index: 6095, index: 5841, level: 6, num:: 30, latency: 215 ns, total latency: 6450 ns, func:node_page_state, origin:node_page_state +******real index: 6133, index: 5857, level: 6, num:: 877351, latency: 561 ns, total latency: 492193911 ns, func:mem_cgroup_page_lruvec, origin:mem_cgroup_page_lruvec +******real index: 6206, index: 5881, level: 6, num:: 503876, latency: 1348 ns, total latency: 679224848 ns, func:release_pages, origin:release_pages +******real index: 6238, index: 5886, level: 6, num:: 895956, latency: 477 ns, total latency: 427371012 ns, func:current_time, origin:current_time +******real index: 6247, index: 5889, level: 6, num:: 415, latency: 397 ns, total latency: 164755 ns, func:__sb_start_write, origin:__sb_start_write +******real index: 6266, index: 5900, level: 6, num:: 833327, latency: 544 ns, total latency: 453329888 ns, func:ktime_get_coarse_real_ts64, origin:ktime_get_coarse_real_ts64 +******real index: 6269, index: 5903, level: 6, num:: 878949, latency: 556 ns, total latency: 488695644 ns, func:timestamp_truncate, origin:timestamp_truncate +******real index: 6274, index: 5904, level: 6, num:: 766, latency: 764 ns, total latency: 585224 ns, func:generic_update_time, origin:generic_update_time +******real index: 6284, index: 5909, level: 6, num:: 288, latency: 317 ns, total latency: 91296 ns, func:__sb_end_write, origin:__sb_end_write +******real index: 6216, index: 6216, level: 7, num:: 4, latency: 2350 ns, total latency: 9400 ns, func:__put_compound_page, origin:__put_compound_page +******real index: 6223, index: 6223, level: 7, num:: 498756, latency: 419 ns, total latency: 208978764 ns, func:mem_cgroup_uncharge_list, origin:mem_cgroup_uncharge_list +******real index: 6224, index: 6224, level: 7, num:: 55042, latency: 542 ns, total latency: 29832764 ns, func:free_unref_page_list, origin:free_unref_page_list +******real index: 6239, index: 6227, level: 7, num:: 849521, latency: 542 ns, total latency: 460440382 ns, func:ktime_get_coarse_real_ts64, origin:ktime_get_coarse_real_ts64 +******real index: 6242, index: 6230, level: 7, num:: 821277, latency: 543 ns, total latency: 445953411 ns, func:timestamp_truncate, origin:timestamp_truncate diff --git a/ops/os_stat/os_stat_user/change.py b/ops/os_stat/os_stat_user/change.py new file mode 100755 index 0000000000000000000000000000000000000000..548d74359b87cc2135cc33d09cf26d928ec797a4 --- /dev/null +++ b/ops/os_stat/os_stat_user/change.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +import sys +import os +import re +import time + +counter = 0 +start = time.perf_counter() +total_length = 50 + + +def change_file(path, symbol, dest_symbol): + global counter + for root, dirs, files in os.walk(path, 
topdown=True): + for name in files: + if name.endswith('.c') or name.endswith('.h'): + fname = os.path.join(root, name) + fin = open(fname, "rt") + alllines = fin.readlines() + fin.close() + fout = open(fname, "w+") + for line in alllines: + # replacing the string and write to output file + a = re.sub(symbol, dest_symbol, line) + fout.writelines(a) + fout.close() + counter += 1 + show_progress() + + +def count_files_in_directory(directory, topdown=True): + total_files = 0 + for root, dirs, files in os.walk(directory): + for fname in files: + if fname.endswith('.c') or fname.endswith('.h'): + total_files += 1 + return total_files + + +def show_progress(): + processed = int(float(counter / totol_files) * total_length) + unprocess = total_length - processed + a = "*" * processed + b = "." * unprocess + c = (counter / totol_files) * 100 + dur = time.perf_counter() - start + print("\r{:^3.0f}%[{}->{}]{:.2f}s".format(c, a, b, dur), end = "") + +if __name__ == "__main__": + path = sys.argv[1] + os.chdir(path) + print(f"Start to change file in {path}") + totol_files = count_files_in_directory(path) * 2 + #change_file(path, '__', 'aa') + change_file(path, '->', 'bb') + print(f"\nChange file in {path} finished.\n") diff --git a/ops/os_stat/os_stat_user/change_resume.py b/ops/os_stat/os_stat_user/change_resume.py new file mode 100755 index 0000000000000000000000000000000000000000..d76594136fef50cca1e37511a74a025587e59ce1 --- /dev/null +++ b/ops/os_stat/os_stat_user/change_resume.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +import sys +import os +import re + +if __name__ == "__main__": + _f1 = sys.argv[1]; + _f2 = sys.argv[2]; + + fin = open(_f1, "rt") + fout = open(_f2, "wt") + for line in fin: + # replacing the string and write to output file + fout.write(line.replace('bb', '->')) + #closing the input and output files + fin.close() + fout.close() diff --git a/ops/os_stat/os_stat_user/checkconfig_origngrap.py b/ops/os_stat/os_stat_user/checkconfig_origngrap.py new file mode 100755 index 0000000000000000000000000000000000000000..d0cf4b7a826f3ddcbe77d9044dc870dcb494a1ba --- /dev/null +++ b/ops/os_stat/os_stat_user/checkconfig_origngrap.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +import sys +import os + + +def parse_conf(_f): + _ret = dict() + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # get kv + eidx = _ll.find("(") + print("" + _ll); + return + +if __name__ == "__main__": + _f1 = sys.argv[1]; + + _kv1 = parse_conf(_f1); diff --git a/ops/os_stat/os_stat_user/data_only.c b/ops/os_stat/os_stat_user/data_only.c new file mode 100644 index 0000000000000000000000000000000000000000..a0a25e702a2286caf2c637d479a2c54bc706eb15 --- /dev/null +++ b/ops/os_stat/os_stat_user/data_only.c @@ -0,0 +1,22 @@ + +#include +#include +#include +#include +#include +#include +#include + +/* + * work with main.c:main, main.c initializes module and context, is main line + * data_only.c:main: here gets data only, which not to initialize context + * and data_only.c:main could work with main.c:main + */ +int main(int argc,char *argv[]) +{ + int ret; + + ret = record_data(0); + + return ret; +} diff --git a/ops/os_stat/os_stat_user/do_adjust.c b/ops/os_stat/os_stat_user/do_adjust.c new file mode 100644 index 0000000000000000000000000000000000000000..dd88d38ce99a2b51aafb3d7fec692b6f5c485647 --- /dev/null +++ b/ops/os_stat/os_stat_user/do_adjust.c @@ -0,0 +1,237 @@ +#include +#include 
+#include +#include +#include +#include +#include +#include + +#include "main.h" + +#define SYS_DEBUG_WITH_PARAMETERS 1 +#define SYS_DEBUG_ENABLE_ONLY 2 +#define SYS_DEBUG_HOOK_VAR 3 +#define SYS_DEBUG_HOOK_VAR_MORE_ONE 4 +#define SYS_DEBUG_HOOK_VAR_MORE_TWO 5 +#define FILE_SIZE 256 * 1048576 +char data_type[] = "process access stat"; +static char os_info_file[] = "/var/log/t-ops/os_stat.log"; +static char kernel_info_periodic_data[] = "/proc/sys/os_aware/debug_data"; +static char kernel_info_func_data[] = "/proc/sys/os_aware/func_data"; +static char os_info_total_file[] = "/var/log/t-ops/os_stat_total.log"; + +long get_file_size_stat(const char *filename) { + struct stat st; + + if (stat(filename, &st) == 0) { + return st.st_size; + } else { + perror("Error getting file size using stat"); + return -1; + } +} + +static struct timeval tv_last; +void show_kernel_info(int func_num, struct func_latency *kernel_data, int fd, int fd_total) +{ + int i, sys_fd, ret; + char buf[128]; + unsigned long file_len, offset; + struct timeval tv; + struct func_latency data; + + gettimeofday(&tv, NULL); + + /* recored each functions data */ + for (i = 0; i < func_num; i++) { + if (kernel_data[i].num == 0) + continue; + + memset(buf, 0, 128); + + /* format data */ +#if 0 + if (!strstr((char *)(kernel_data[i].func + (NAME_MAX >> 1)), data_type)) { + sprintf(buf, "\n======:current pid:%8ld, %16s, reserved:%4ld inode number:%10ld, disk:%16s, time:%d(s)\0\n", + kernel_data[i].nr, kernel_data[i].func, kernel_data[i].num, kernel_data[i].latency, + (char *)(kernel_data[i].func + (NAME_MAX >> 1)), tv.tv_sec); + } + else +#endif + if (!strstr((char *)(kernel_data[i].func + (NAME_MAX >> 1)), data_type)) { + sprintf(buf, "\n======:current pid:%8ld, free:%4ld dirty:%10ld, disk:%8s, %8s, time:%d(s), need uses rpm:%d\0\n", + kernel_data[i].nr, kernel_data[i].num, kernel_data[i].latency, + kernel_data[i].func, (char *)(kernel_data[i].func + (NAME_MAX >> 1)), + tv.tv_sec, kernel_data[i].num < 4 * kernel_data[i].latency); + } else { + sprintf(buf, "\n======:current pid:%8ld, %16s, total:%4ld, operation:%32s, time:%d(s)\0\n", + kernel_data[i].nr, kernel_data[i].func, kernel_data[i].num, + (char *)(kernel_data[i].func + (NAME_MAX >> 1)), tv.tv_sec); + } + + file_len = get_file_size_stat(os_info_file); + + /* filesizes need < FILE_SIZE, not to use too many disk space */ + offset = lseek(fd, 0, SEEK_CUR); + if ((offset < file_len) && (file_len < FILE_SIZE)) + lseek(fd, file_len, SEEK_SET); + if (file_len > FILE_SIZE && offset > FILE_SIZE) + lseek(fd, 0, SEEK_SET); + + write(fd, buf, strlen(buf)); + } + + if (tv.tv_sec - tv_last.tv_sec < 100) + return; + tv_last.tv_sec = tv.tv_sec; + tv_last.tv_usec = tv.tv_usec; + + /* get periodic stat data */ + sys_fd = open(kernel_info_periodic_data, O_RDONLY); + if (sys_fd < 0) { + printf("open file error:%d\n", sys_fd); + goto out; + } + + ret = read(sys_fd, &data, sizeof(struct func_latency)); + + /* format data */ + sprintf(buf, "\n======:inode ino:%8ld, %32s, reserved:%4ld, time:%d(s)\0\n", + data.nr, data.func, data.num, tv.tv_sec); + + + /* if no data update, no need to recored */ + if (data.num == 0) + goto not_write; + + + /* filesizes need < FILE_SIZE, not to use too many disk space */ + file_len = get_file_size_stat(os_info_total_file); + offset = lseek(fd_total, 0, SEEK_CUR); + if (file_len > FILE_SIZE && offset > FILE_SIZE) + lseek(fd_total, 0, SEEK_SET); + + write(fd_total, buf, strlen(buf)); + +not_write: + close(sys_fd); +out: + return; +} + +int record_data(int cycle) +{ + int 
file_fd, file_fd_total, fd, ret;
+	struct func_latency data;
+
+	file_fd = open(os_info_file, O_RDWR | O_CREAT, S_IRWXU | S_IRGRP | S_IROTH);
+	printf("open os_stat.log %d, %s\n", file_fd, os_info_file);
+	if (file_fd < 0) {
+		printf("debug open os_stat.log error:%d, %s\n", fd, os_info_file);
+		return -EINVAL;
+	}
+
+	file_fd_total = open(os_info_total_file, O_RDWR | O_CREAT, S_IRWXU | S_IRGRP | S_IROTH);
+	if (file_fd_total < 0) {
+		printf("open os_stat_total.log error:%d\n", file_fd_total);
+		goto out;
+	}
+
+	fd = open(kernel_info_func_data, O_RDONLY);
+	if (fd < 0) {
+		printf("open func_data error:%d\n", fd);
+		goto close_file;
+	}
+
+	/* open file, seek to end, write from file end */
+	lseek(file_fd_total, 0, SEEK_END);
+
+	/* clear buf */
+	memset(&data, 0, sizeof(struct func_latency));
+
+	do {
+
+		/* get data from kernel */
+		ret = read(fd, &data, sizeof(struct func_latency));
+
+		if (ret <= 0)
+			break;
+
+		/* write data to log file */
+		show_kernel_info(1, &data, file_fd, file_fd_total);
+
+	} while (cycle);
+
+malloc_fail:
+	close(fd);
+close_file:
+	close(file_fd_total);
+out:
+	close(file_fd);
+}
+
+int enable_sys_debug(int val)
+{
+	char buf[4] = {'1'}, ret, fd;
+
+	fd = open("/proc/sys/os_aware/enable_debug", O_RDWR);
+	if (fd < 0) {
+		printf("open error:%d\n", fd);
+		return fd;
+	}
+
+	if (val > 1)
+		sprintf(buf, "%d", val);
+	ret = write(fd, buf, strlen(buf));
+
+	close(fd);
+	return ret;
+}
+
+int trace_which(char *sysctl, char *value)
+{
+	int ret, fd;
+
+	fd = open(sysctl, O_RDWR);
+	if (fd < 0) {
+		printf("open error:%d\n", fd);
+		return fd;
+	}
+
+	ret = write(fd, value, strlen(value));
+
+	close(fd);
+	return ret;
+}
+//print:68, ext4/vdb .etc
+void main_info(int num, char * parameter, int len, int val)
+{
+	int i;
+	char sys[3][64]={"/proc/sys/os_aware/printk_struct_first_name\0",
+		"/proc/sys/os_aware/printk_struct_last_name\0",
+		"/proc/sys/os_aware/ftrace_hook_one_var"};
+
+	enable_sys_debug(val);
+
+	if (val == SYS_DEBUG_ENABLE_ONLY)
+		return;
+
+	if (val == SYS_DEBUG_WITH_PARAMETERS) {
+		for(i = 0; i < num; i++)
+			trace_which(sys[i], &parameter[i * len]);
+	}
+
+	if (val == SYS_DEBUG_HOOK_VAR) {
+		for(i = 0; i < num; i++)
+			trace_which(sys[2], &parameter[i * len]);
+	}
+	if (val == SYS_DEBUG_HOOK_VAR_MORE_ONE || val == SYS_DEBUG_HOOK_VAR_MORE_TWO) {
+		for(i = 0; i < 2; i++) {
+			trace_which(sys[i], &parameter[i * len]);
+		}
+		for(i = 2; i < num; i++) {
+			trace_which(sys[2], &parameter[i * len]);
+		}
+	}
+}
diff --git a/ops/os_stat/os_stat_user/get_data.py b/ops/os_stat/os_stat_user/get_data.py
new file mode 100755
index 0000000000000000000000000000000000000000..59a88c4481dbab20423a5984377097496fdabf83
--- /dev/null
+++ b/ops/os_stat/os_stat_user/get_data.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+
+
+def parse_conf(_f, _num):
+    _ret = dict()
+    i=0
+    with open(_f, 'r') as _fd:
+        for _l in _fd.readlines():
+            _ll = _l.strip()
+
+            # skip null lines
+            if not _ll:
+                continue
+
+            # skip comment lines
+            if _ll.find("#") == 0:
+                continue
+
+            # skip not ave lines
+            if _ll.find("ave latency") == 0:
+                continue
+            if i < int(_num):
+                i=i+1
+                continue
+
+            # get num
+            eidx = _ll.find("num:")
+            if eidx == -1 and i == int(_num):
+                return
+            tmpidx = _ll.find(", ave")
+            key = _ll[eidx + 4:tmpidx].strip()
+            print("{:<40}".format(key))
+            return
+
+if __name__ == "__main__":
+    _f1 = sys.argv[1];
+    _num = sys.argv[2];
+
+    _kv1 = parse_conf(_f1, _num);
diff --git a/ops/os_stat/os_stat_user/get_func_level.py b/ops/os_stat/os_stat_user/get_func_level.py
new file mode 100755
index 0000000000000000000000000000000000000000..8a37e54cc9d38792dc9344cbcc86774e3c499845 --- /dev/null +++ b/ops/os_stat/os_stat_user/get_func_level.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# +#+-ext4_file_read_iter() +# +-file_inode() +# +-unlikely() +# +-ext4_forced_shutdown() +# +-EXT4_SB() +# +-iov_iter_count() +# +-IS_DAX() +# +-ext4_dax_read_iter() +# | +-file_inode() +# | +-inode_trylock_shared() +# | +-inode_lock_shared() +# | +-IS_DAX() +# | +-inode_unlock_shared() +# | +-generic_file_read_iter() +# | +-iov_iter_count() +# | | +-i_size_read() +#第三个参数8:只输出第8行开始, 即ext4_dax_read_iter下一级level的函数, +#第二个参数1:ext4_dax_read_iter level是parent level, 这里是1, 其child 是2, grandson是3,只输出level=2的函数; +#第一个参数来自于cflow生成的文件 func_tree.txt; +#./get_subfunc_level.py func_tree.txt 1 8 +#child: 2 > parent: 1 < grandson: 3 :: -file_inode() +#child: 2 > parent: 1 < grandson: 3 :: -inode_trylock_shared() +#child: 2 > parent: 1 < grandson: 3 :: -inode_lock_shared() +#child: 2 > parent: 1 < grandson: 3 :: -IS_DAX() +#child: 2 > parent: 1 < grandson: 3 :: -inode_unlock_shared() +#child: 2 > parent: 1 < grandson: 3 :: -generic_file_read_iter() +#child: 2 > parent: 1 < grandson: 3 :: -iov_iter_count() +import sys +import os + + +def parse_conf(_f): + _ret = dict() + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # get kv + eidx = _ll.find("(") + if eidx == -1: + print("WARNNING: not = find in " + _ll); + continue + level = 0 + tmpidx = _ll.find("+-") + if tmpidx == -1: + tmpidx = _ll.find("\-") + if tmpidx != -1: + level = tmpidx + tmpidx = tmpidx+2 + key = _ll[tmpidx:eidx].strip() + else: + level = tmpidx + tmpidx = tmpidx+2 + key = _ll[tmpidx:eidx].strip() + val = _ll[eidx+1:].strip() + + print("", level) + break + return _ret + + +def comp_conf(_kv1): + _ret = dict() + _ret['same'] = dict() + _ret['diff'] = dict() + _ret['miss'] = dict() + miss=0 + for _k1, _v1 in _kv1.items(): + _tmp = dict() + _ret['diff'][_k1] = {} + return _ret + + +def output(result): + for _k in ('miss', 'diff', 'same'): + _v = result.get(_k, {}) + for __k, __v in _v.items(): + print("{:<40}".format(__v)) + break + +if __name__ == "__main__": + _f1 = sys.argv[1]; + + _kv1 = parse_conf(_f1); + ret = comp_conf(_kv1) + #output(ret) diff --git a/ops/os_stat/os_stat_user/get_func_name.py b/ops/os_stat/os_stat_user/get_func_name.py new file mode 100755 index 0000000000000000000000000000000000000000..335795733b41a8c4c64a1e6c7d5af9ea2acc8ec5 --- /dev/null +++ b/ops/os_stat/os_stat_user/get_func_name.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 + +import sys +import os + + +def parse_conf(_f, _v): + _ret = dict() + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # get kv + eidx = _ll.find("(") + if eidx == -1: + print("WARNNING: not = find in " + _ll); + continue + tmpidx = _ll.find("+-") + if tmpidx == -1: + tmpidx = _ll.find("\-") + if tmpidx != -1: + tmpidx = tmpidx+2 + key = _ll[tmpidx:eidx].strip() + else: + tmpidx = tmpidx+2 + key = _ll[tmpidx:eidx].strip() + val = _ll[eidx+1:].strip() + + # use None to descript no val config + if len(val) == 0: + val = 'None' + + tmpidx = val.find(_v) + if tmpidx != -1: + _ret[key] = val + + break + + return _ret + + +def comp_conf(_kv1): + _ret = dict() + _ret['same'] = dict() + _ret['diff'] = dict() + _ret['miss'] = dict() + miss 
= 0 + for _k1,_v1 in _kv1.items(): + _tmp = dict() + _ret['diff'][_k1] = {} + return _ret + + +def output(result): + for _k in ('miss', 'diff', 'same'): + _v = result.get(_k, {}) + for __k, __v in _v.items(): + print("{:<40}".format(__k)) + break + +if __name__ == "__main__": + _f1 = sys.argv[1]; + _v1 = sys.argv[2]; + + if os.path.exists(_f1): + _kv1 = parse_conf(_f1, _v1); + ret = comp_conf(_kv1) + output(ret) diff --git a/ops/os_stat/os_stat_user/get_function_subpath.py b/ops/os_stat/os_stat_user/get_function_subpath.py new file mode 100755 index 0000000000000000000000000000000000000000..d2fbcab137d14739e32f8f2c23a02618d9849beb --- /dev/null +++ b/ops/os_stat/os_stat_user/get_function_subpath.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +import sys +import os + + +def parse_conf(_f, _func): + _ret = dict() + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + eidx = _ll.find(")") + if eidx == -1: + eidx = _ll.find(",") + if eidx == -1: + continue + + substring = _ll[ : eidx + 1].strip() + func_idx = substring.find(_func) + if func_idx == -1: + continue + prev_idx = substring.find(":") + func_string = substring[ prev_idx + 1:func_idx].strip() + if func_string.find("=") != -1: + continue + eidx = _ll.find("(") + if eidx == -1: + continue + + eidx_tmp = substring.find(",") + if eidx_tmp != -1: + substring = _ll[eidx : eidx_tmp].strip() + if substring.find(" ") == -1: + continue + substring = _ll[ : eidx_tmp].strip() + elif eidx_tmp == -1: + eidx_tmp = substring.find("void") + if eidx_tmp == -1: + eidx_tmp = substring.find(")") + if eidx_tmp == -1: + continue + substring = substring[ : eidx_tmp].strip() + if substring.find(" ") == -1: + continue + substring = _ll[eidx : eidx_tmp].strip() + if substring.find(" ") == -1: + continue + substring = _ll[ : eidx_tmp].strip() + + eidx = substring.find(".c") + if eidx == -1: + continue + + substring = substring[ : eidx].strip() + + if substring.find("tools") > 0: + continue + + # get kv + eidx = substring.rfind("/") + key = substring[0:eidx+1].strip() + count = key.count("/") + if count < 4: + continue + print("" + key); + return 1 + + return 0 +def parse_conf_header(_f, _func, symbol): + _ret = dict() + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + eidx = _ll.find(")") + if eidx == -1: + eidx = _ll.find(",") + if eidx == -1: + continue + + substring = _ll[ : eidx + 1].strip() + func_idx = substring.find(_func) + if func_idx == -1: + continue + prev_idx = substring.find(":") + func_string = substring[ prev_idx + 1:func_idx].strip() + if func_string.find("=") != -1: + continue + eidx = _ll.find("(") + if eidx == -1: + continue + + eidx = _ll.find(symbol) + if eidx == -1: + continue + + eidx_tmp = substring.find(")") + if eidx_tmp == -1: + continue + substring = _ll[ : eidx_tmp].strip() + + if substring.find("tools") > 0: + continue + + eidx = substring.find(".c") + if eidx == -1: + eidx = substring.find(".h") + if eidx == -1: + continue + + # get kv + eidx = substring.rfind("/") + key = substring[0:eidx+1].strip() + count = key.count("/") + if count < 4: + continue + print("" + key); + return 1 + + return 0 +if __name__ == "__main__": + _f1 = sys.argv[1]; + _func = sys.argv[2]; + + _ret = parse_conf(_f1, _func); + if _ret == 0: + _ret = parse_conf_header(_f1, _func, "#define"); 
+ if _ret == 0: + parse_conf_header(_f1, _func, "SYSCALL_DEFINE"); diff --git a/ops/os_stat/os_stat_user/get_latency.py b/ops/os_stat/os_stat_user/get_latency.py new file mode 100755 index 0000000000000000000000000000000000000000..67c0d8a8f984da783208e79b92c91e2701ed443b --- /dev/null +++ b/ops/os_stat/os_stat_user/get_latency.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import sys +import os + + +def parse_conf(_f, _num): + _ret = dict() + i=0 + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # get kv + eidx = _ll.find("ave latency:") + if eidx == -1: + continue + if i < int(_num): + i=i+1 + continue + tmpidx = _ll.find("(ns)") + key = _ll[eidx + 12:tmpidx].strip() + print("{:<40}".format(key)) + return + +if __name__ == "__main__": + _f1 = sys.argv[1]; + _num = sys.argv[2]; + + _kv1 = parse_conf(_f1, _num); diff --git a/ops/os_stat/os_stat_user/get_pointer_func.py b/ops/os_stat/os_stat_user/get_pointer_func.py new file mode 100755 index 0000000000000000000000000000000000000000..2d3e95bd5184169dcb132f94a21d1ba4db59b442 --- /dev/null +++ b/ops/os_stat/os_stat_user/get_pointer_func.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +import sys +import os +def parse_conf(_f): + _ret = dict() + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # get kv + eidx = _ll.find("func:") + if eidx == -1: + continue + + function = _ll[eidx + 5 : ].strip() + + tmpidx = function.find("(") + if tmpidx == -1: + continue + + key = function[: tmpidx].strip() + print("{:<40}".format(key)) + return + +if __name__ == "__main__": + _f1 = sys.argv[1]; + + _kv1 = parse_conf(_f1); diff --git a/ops/os_stat/os_stat_user/get_subfunc_level_name.py b/ops/os_stat/os_stat_user/get_subfunc_level_name.py new file mode 100755 index 0000000000000000000000000000000000000000..a867f7f49f9cccb902b43b95e88bfabd207d5527 --- /dev/null +++ b/ops/os_stat/os_stat_user/get_subfunc_level_name.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 + +import sys +import os + + +def parse_conf(_f, upper_level, num, start_num, flags): + _ret = 0 + max_level = -1 + cycle = 0 + this_level_cycle = 0 + total_num = 0 + last_idx = 0 + cur_num = int(num) - int(start_num) + before = 1 + with open(_f, 'r') as _fd: + for _ll in _fd.readlines(): + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + total_num += 1 + if total_num < int(start_num): + continue + + # get kv + eidx = _ll.find("(") + if eidx == -1: + print("WARNNING: not = find in " + _ll); + continue + level = upper_level + tmpidx = _ll.find("+-") + if tmpidx == -1: + tmpidx = _ll.find("\-") + if tmpidx != -1 and eidx != -1: + level = tmpidx >> 1 + key = _ll[0:tmpidx].strip() + val = _ll[tmpidx+2:eidx].strip() + count = 0; + level += count + sub_idx = _ll.find("|") + tmp_level = level + if sub_idx != -1: + tmp_level = (tmpidx - sub_idx) >> 1 + if tmp_level > level: + level = tmp_level + else: + if last_idx == tmpidx: + last_idx = 0 + level = tmpidx >> 1 + key = _ll[0:tmpidx].strip() + val = _ll[tmpidx+2:eidx].strip() + sub_idx = _ll.find("|") + tmp_level = level + if sub_idx != -1: + tmp_level = (tmpidx - sub_idx) >> 1 + if tmp_level > level: + level = tmp_level + if last_idx != 0: + level += 1 + cycle += 1 + if int(flags) != 20: + if level == int(upper_level) and cycle > 1: + break + if level == int(upper_level): + continue + if level >= int(upper_level) + 2: + continue + if level <= 
int(upper_level) and this_level_cycle > int(cur_num): + break + + before = 0 + if level <= int(upper_level): + continue + this_level_cycle += 1 + if this_level_cycle < int(cur_num): + continue + if this_level_cycle > int(cur_num) and int(cur_num) != 0: + break + if int(flags) == 1: + print(val) + elif int(flags) == 0: + print("NULL") + else: + print(total_num) + + if max_level < level: + max_level = level + break + + return _ret + +if __name__ == "__main__": + + _f1 = sys.argv[1]; + level = sys.argv[2]; + num = sys.argv[3]; + start = sys.argv[4]; + print_flags = sys.argv[5]; + + _kv1 = parse_conf(_f1, level, num, start, print_flags); diff --git a/ops/os_stat/os_stat_user/main.c b/ops/os_stat/os_stat_user/main.c new file mode 100644 index 0000000000000000000000000000000000000000..dacba555b7e921019e0f29d8d8ccda647bf80989 --- /dev/null +++ b/ops/os_stat/os_stat_user/main.c @@ -0,0 +1,1549 @@ +#include +#include +#include +#include +#include +#include +#include + +#define DISK_SDA 8 +#define DISK_DM 251 +#define DISK_VD 253 +#define DISK_NVME 259 +#define DISK_HDD 65 +#define DISK_HDD1 66 +#define DISK_HDD2 67 +#define DISK_HDD3 68 +#define DISK_SDA_NUM 40 +#define DISK_HDD_NUM 60 +#define DISK_HDD_SUBNUM 15 +#define DISK_DM_NUM 16 +#define DISK_VD_NUM 80 +#define DISK_NVME_NUM 80 +#define DISK_TOTAL_NUM (DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM + DISK_VD_NUM + DISK_NVME_NUM) +#define PRINT_SYSCALL_NUM 12 +#define PRINT_MORE_NUM 13 +#define TRACK_SYSCALL_NUM (PRINT_SYSCALL_NUM + PRINT_MORE_NUM + DISK_TOTAL_NUM) +#define HOOK_FUNC_NUM 50 +#define NAME_LENGTH 256 +enum ftrace_status { + FTRACE_INIT = 0, + FTRACE_KRETPROBE_REGISTER = 1, + FTRACE_REGISTER = 2, + FTRACE_UNREGISTERING = 3, + FTRACE_UNREGISTE_STARTED = 4, + FTRACE_UNREGISTED = 5, + FTRACE_REGISTER_FAILED = 6 +}; + +/* malloc 7 more, to stat max/min .etc */ +enum data_stat_type { + DATA_MAX=0, + DATA_MIN, + DATA_TOTAL, + DATA_AVE, + DATA_P90, + DATA_P95, + DATA_P99, + DATA_NR +}; + +struct func_latency { + unsigned long nr; + long num; + unsigned long latency; + unsigned long block_latency; + char func[NAME_LENGTH]; +}; +unsigned long get_cpu_frequency() +{ + FILE *fp; + char buffer[128]; + char frequency[16]; + + fp = fopen("/proc/cpuinfo", "r"); + if (fp == NULL) { + perror("fopen"); + return EXIT_FAILURE; + } + + while (fgets(buffer, sizeof(buffer), fp) != NULL) { + if(strstr(buffer, "cpu MHz")){ + strncpy(frequency, buffer + 10, strlen(buffer) - 10); + fclose(fp); + return atof(frequency); + } + } + + fclose(fp); + return 0; +} + +int control_os_stat(char *buf, char *item) +{ + int ret, fd; + char sysctl_interface[128] = "/proc/sys/os_aware/"; + + strcat(sysctl_interface, item); + + fd = open(sysctl_interface, O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, strlen(buf)); + + lseek(fd, 0, SEEK_SET); + + ret = read(fd, buf, strlen(buf)); + if (ret <= 0) { + printf("read error:%d\n", ret); + close(fd); + return ret; + } + + close(fd); + return ret; +} +int enable_track_proc(char *proc) +{ + int ret, fd; + + + fd = open("/proc/sys/os_aware/proc_comm", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, proc, strlen(proc)); + + close(fd); + return ret; +} +int enable_block_stat(int block) +{ + char buf[4] = {'1'}, ret, fd; + + if (block == 0) + return 0; + + fd = open("/proc/sys/os_aware/enable_stat_block", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, sizeof(*buf)); + + close(fd); + 
return ret; +} +int set_monitor_sample(int sample_rate) +{ + int ret, fd; + char buf[8]; + + if (sample_rate <= 1) + return 0; + + fd = open("/proc/sys/os_aware/monitor_sample_rate", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + sprintf(buf, "%d", sample_rate); + ret = write(fd, buf, strlen(buf)); + + close(fd); + return ret; +} +int enable_sys_stat() +{ + char buf[4] = {'1'}, ret, fd; + + fd = open("/proc/sys/os_aware/system_hook_function", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, sizeof(*buf)); + + close(fd); + return ret; +} +int disable_sys_stat() +{ + char buf[4] = {'1'}, ret, fd; + + fd = open("/proc/sys/os_aware/system_unhook_function", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, sizeof(*buf)); + + close(fd); + return ret; +} +int enable_os_stat() +{ + char buf[4] = {'1'}, ret, fd; + + fd = open("/proc/sys/os_aware/enable", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, sizeof(*buf)); + + lseek(fd, 0, SEEK_SET); + + ret = read(fd, buf, sizeof(*buf)); + if (ret <= 0) { + printf("read error:%d\n", ret); + close(fd); + return ret; + } + + + close(fd); + return ret; +} + +int register_kret_ftrace_func(char *buf) +{ + int ret, fd; + + fd = open("/proc/sys/os_aware/kprobe_register_func", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, strlen(buf)); + + close(fd); + return ret; +} + +int unregister_one_func(char *buf) +{ + int ret, fd; + + fd = open("/proc/sys/os_aware/kprobe_unregister_func", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, strlen(buf)); + + close(fd); + return ret; +} + +int get_kretprobe_register_status(void) +{ + int ret, fd; + char success[4]; + + fd = open("/proc/sys/os_aware/kret_probe_success", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return 0; + } + + ret = read(fd, success, sizeof(success)); + + close(fd); + + if (ret > 0) + return atoi(success); + + return ret; +} + +int get_ftrace_register_status(void) +{ + int ret, fd; + char success[4]; + + + fd = open("/proc/sys/os_aware/ftrace_success", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return 0; + } + + ret = read(fd, success, sizeof(success)); + + close(fd); + + if (ret > 0) + ret = atoi(success); + + return ret; +} + +int get_register_status(void) +{ + int ret, fd; + + ret = get_kretprobe_register_status(); + if (ret == FTRACE_KRETPROBE_REGISTER) + return ret; + + + ret = get_ftrace_register_status(); + if (ret == FTRACE_REGISTER) + return ret; + if (ret == FTRACE_REGISTER_FAILED) + return ret; + + return 0; +} + +int test_status(int status) +{ + int ret = 0, fd, i; + char success[4]; + + fd = open("/proc/sys/os_aware/ftrace_success", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + while (1) { + + ret = read(fd, success, sizeof(success)); + if (ret <= 0) + return -EINVAL; + + if (atoi(success) == status) + break; + if (atoi(success) == FTRACE_REGISTER_FAILED) + break; + + lseek(fd, 0, SEEK_SET); + usleep(20000); + } + + ret = atoi(success); + + return ret; +} + +int register_ftrace_func(char *buf) +{ + int ret, fd; + char success[4]; + + fd = open("/proc/sys/os_aware/ftrace_hook_function", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, strlen(buf)); + + close(fd); + + ret = 
test_status(FTRACE_REGISTER); + + if (ret != FTRACE_REGISTER) + ret = -EINVAL; + + return ret; +} + +int register_one_ftrace_func(char *buf) +{ + int ret, fd; + + fd = open("/proc/sys/os_aware/ftrace_hook_one_function", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, strlen(buf)); + + close(fd); + return ret; +} + +int unregister_ftrace_func(char *buf) +{ + int ret, fd; + + fd = open("/proc/sys/os_aware/ftrace_unhook_function", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, strlen(buf)); + + close(fd); + return ret; +} + +int read_one_func_data(struct func_latency *test_data, int func_num) +{ + int fd, ret; + + fd = open("/proc/sys/os_aware/func_data", O_RDONLY); + if (fd < 0) { + printf("open func_data error:%d\n", fd); + return 0; + } + + ret = read(fd, test_data, func_num * sizeof(struct func_latency)); + if (ret <= 0) + printf("read func_data error:%d, %d\n", ret, func_num); + + close(fd); + return ret; +} + +int write_func_pointer(char *buf) +{ + int ret, fd; + + fd = open("/proc/sys/os_aware/write_func_pointer", O_RDWR); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = write(fd, buf, strlen(buf)); + + close(fd); + return ret; +} + +int get_func_pointer(char **buf) +{ + int ret, fd; + + fd = open("/proc/sys/os_aware/get_func_pointer", O_RDONLY); + if (fd < 0) { + printf("open error:%d\n", fd); + return fd; + } + + ret = read(fd, *buf, 64); + + close(fd); + return ret; +} + +int process_func_pointer(char *name, char *parent_name, char *origin_name) +{ + int ret; + char buf[4] = {'2'}; + char *ptr; + + if (!strstr(name, "->") && !strstr(name, "INDIRECT_CALL_INET") && !strstr(name, "INDIRECT_CALL_2")) + return 0; + if (strstr(parent_name, "->")) + return -1; + + ret = register_ftrace_func(parent_name); + if (ret < 0) + goto return_ret; + + write_func_pointer(name); + + control_os_stat(buf, "control_stat"); + usleep(1000000); + + ret = get_func_pointer(&origin_name); + + memset(buf, '0', sizeof(*buf)); + control_os_stat(buf, "control_stat"); + + ptr = strchr(origin_name, '+'); + if (ptr) + *ptr = '\0'; +return_ret: + unregister_ftrace_func(parent_name); + + return ret; +} +/* catch system info, top syscall maybe different, need sort fisrt */ +void data_sort_syscall(struct func_latency arr[], int len, int func_num) +{ + int i, j = 0, m, k, nr = 0, last_nr = -1, index, n, h, q, t, old_index = -1; + int num_arr[PRINT_SYSCALL_NUM], g; + + n = (len + 1 ) * func_num; + for (i = 0; i < PRINT_SYSCALL_NUM; i++) + arr[n + i].num = -1; + + /* see syscall [0, PRINT_SYSCALL_NUM] rows */ + for (i = 0; i < PRINT_SYSCALL_NUM; i++) { + j = i; + /* [len, DATA_NR] save max/min/p90 .etc */ + for (m = 0; m < len - DATA_NR; m++) { + if (arr[j].num == 0) + goto next; + for(k = 0; k <= i; k++) { + if (arr[n + k].num == -1) + break; + if (arr[j].nr == arr[arr[n + k].num].nr) { + if (arr[j].num > arr[arr[n + k].num].num) + arr[n + k].num = j; + goto next; + } + } + + if (arr[n + i].num != -1 && arr[j].nr == arr[arr[n + i].num].nr) { + if (arr[j].num > arr[arr[n + i].num].num) + arr[n + i].num = j; + } + else { + index = j; + nr = arr[j].nr; + last_nr = nr; + for(k = i; k < PRINT_SYSCALL_NUM; k++) { + if (arr[n + k].num == -1 && nr != -1) { + arr[n + k].num = index; + arr[n + k + PRINT_SYSCALL_NUM].num = nr; + break; + } + if (nr != -1 && nr == arr[arr[n + k].num].nr && arr[index].num > arr[arr[n + k].num].num) { + arr[n + k].num = index; + arr[n + k + PRINT_SYSCALL_NUM].num = nr; + nr 
= -1; + continue; + } + if (arr[n + k].num != -1 && arr[arr[n + k].num].nr == last_nr) { + arr[n + k].num = -1; + arr[n + k + PRINT_SYSCALL_NUM].num = -1; + } + if (arr[n + k].num != -1 && nr != -1 && nr != arr[arr[n + k].num].nr) { + if (arr[index].num > arr[arr[n + k].num].num) { + nr = arr[n + k + PRINT_SYSCALL_NUM].num; + arr[n + k + PRINT_SYSCALL_NUM].num = arr[index].nr; + old_index = arr[n + k].num; + arr[n + k].num = index; + index = old_index; + } + } + } + } +next: + j = j + func_num; + } + } + + j = len * func_num; + k = (len + 2 ) * func_num; + for (i = 0; i < PRINT_SYSCALL_NUM; i++) { + if (arr[n + i].num == -1) + return; + arr[j + i].num = arr[arr[n + i].num].num; + arr[j + i].nr = arr[arr[n + i].num].nr; + arr[j + i].latency = arr[arr[n + i].num].latency; + arr[j + i].block_latency = arr[arr[n + i].num].block_latency; + strcpy(arr[j + i].func, arr[arr[n + i].num].func); + + for(h = 0; h < (len - DATA_NR); h++) { + m = h * func_num + i; + for (q = 0; q < PRINT_SYSCALL_NUM; q++) { + /* stat max latency not based num order, whole max */ + if ((arr[m].latency <= arr[k + q].latency) && arr[m].nr == arr[k + q].nr) + break; + if ((arr[k + q].latency == 0 || (arr[m].latency > arr[k + q].latency))) { + if (arr[m].nr == arr[k + q].nr) + goto update_latency; + g = PRINT_SYSCALL_NUM -1; + for (t = q + 1; t < PRINT_SYSCALL_NUM -1; t++) { + if (arr[k + t].nr == arr[m].nr) { + g = t; + break; + } + } + for (t = g; t > q; t--) { + arr[k + t].num = arr[k + t - 1].num; + arr[k + t].nr = arr[k + t - 1].nr; + arr[k + t].latency = arr[k + t - 1].latency; + arr[k + t].block_latency = arr[k + t - 1].block_latency; + strcpy(arr[k + t].func, arr[k + t - 1].func); + } +update_latency: + arr[k + q].num = arr[m].num; + arr[k + q].nr = arr[m].nr; + arr[k + q].latency = arr[m].latency; + arr[k + q].block_latency = arr[m].block_latency; + strcpy(arr[k + q].func, arr[m].func); + break; + } + } + for (q = 0; q < PRINT_SYSCALL_NUM; q++) { + /* stat max block latency not based num order, whole max */ + if ((arr[m].block_latency <= arr[k + func_num + q].block_latency) && arr[m].nr == arr[k + func_num + q].nr) + break; + if ((arr[k + func_num + q].block_latency == 0 || arr[m].block_latency > arr[k + func_num + q].block_latency)) { + if (arr[m].nr == arr[k + func_num + q].nr) + goto update_block; + g = PRINT_SYSCALL_NUM -1; + for (t = q + 1; t < PRINT_SYSCALL_NUM -1; t++) { + if (arr[k + func_num + t].nr == arr[m].nr) { + g = t; + break; + } + } + for (t = g; t > q; t--) { + arr[k + func_num + t].num = arr[k + func_num + t - 1].num; + arr[k + func_num + t].nr = arr[k + func_num + t - 1].nr; + arr[k + func_num + t].latency = arr[k + func_num + t - 1].latency; + arr[k + func_num + t].block_latency = arr[k + func_num + t - 1].block_latency; + strcpy(arr[k + func_num + t].func, arr[k + func_num + t - 1].func); + } +update_block: + arr[k + func_num + q].num = arr[m].num; + arr[k + func_num + q].nr = arr[m].nr; + arr[k + func_num + q].latency = arr[m].latency; + arr[k + func_num + q].block_latency = arr[m].block_latency; + strcpy(arr[k + func_num + q].func, arr[m].func); + break; + } + } + } + } +} +void data_sort(struct func_latency arr[], int len, int func_num, int order, int syscall) +{ + unsigned long total_num = 0, total_nr = 0, total_latency = 0; + unsigned long total_block_latency = 0, min, max, ave; + int i, j, m, p90, p95, p99, nr = -1, max_index, total = 1, n, k; + int p90_index, p95_index, p99_index; + + if (syscall && order > 0) + nr = arr[(len - DATA_NR - 1) * func_num + order - 1].nr; + + max_index = len 
* func_num; + n = (len + 1 ) * func_num + order; + + for (i = 0; i < len - DATA_NR - 1; i++) { + for (m = 0; m < len - DATA_NR - 1 - i; m++) { + j = m * func_num + order; + if (syscall && order < PRINT_SYSCALL_NUM && arr[j].nr == nr) { + arr[j].num = 0; + arr[j].latency = 0; + arr[j].block_latency = 0; + continue; + } + if (syscall && order < PRINT_SYSCALL_NUM && arr[j].nr != arr[max_index + order].nr && + (arr[j + func_num].nr == arr[max_index + order].nr)) + continue; + if ((arr[j].latency > arr[j + func_num].latency) + || (syscall && order < PRINT_SYSCALL_NUM && arr[j + func_num].nr != arr[max_index + order].nr)) { + unsigned long latency = arr[j].latency; + unsigned long num = arr[j].num; + unsigned long nr = arr[j].nr; + unsigned long block_latency = arr[j].block_latency; + char func[NAME_LENGTH]; + strcpy(func, arr[j].func); + arr[j].nr = arr[j + func_num].nr; + arr[j].num = arr[j + func_num].num; + arr[j].latency = arr[j + func_num].latency; + arr[j].block_latency = arr[j + func_num].block_latency; + strcpy(arr[j].func, arr[j + func_num].func); + + arr[j + func_num].nr = nr; + arr[j + func_num].num = num; + arr[j + func_num].latency = latency; + arr[j + func_num].block_latency = block_latency; + strcpy(arr[j + func_num].func, func); + } + } + } + + /* stat max block latency based call num order*/ + for (i = 0; i < len - DATA_NR; i++) { + m = i * func_num + order; + if ((arr[n].block_latency == 0 || arr[m].block_latency > arr[n].block_latency) + && (!syscall || order >= PRINT_SYSCALL_NUM + || arr[m].nr == arr[max_index + order].nr)) { + arr[n].num = arr[m].num; + arr[n].nr = arr[m].nr; + arr[n].latency = arr[m].latency; + arr[n].block_latency = arr[m].block_latency; + strcpy(arr[n].func, arr[m].func); + } + } + + j = (len - DATA_NR - 1) * func_num + order; + for (m = 0; m < len - DATA_NR; m++) { + i = m * func_num + order; + /* for save memory or more data recored in nr, such as memory */ + if (syscall && (order == PRINT_SYSCALL_NUM + 3 || order == PRINT_SYSCALL_NUM + 4)) { + if ((long)arr[i].nr > 0) + total_nr += arr[i].nr; + } + if ((long)arr[i].num > 0 && (!syscall || order >= PRINT_SYSCALL_NUM || arr[i].nr == arr[j].nr)) { + total_num += arr[i].num; + total_latency += arr[i].num * arr[i].latency; + total_block_latency += arr[i].num * arr[i].block_latency; + } + } + if (total_nr == 0 && (long)arr[order].nr >= 0) + total_nr = arr[order].nr; + /*max*/ + i = (len - DATA_NR) * func_num + order; + if ((long)arr[j].nr >= 0) + arr[i].nr = arr[j].nr; + if ((long)arr[j].num >= 0) + arr[i].num = arr[j].num; + arr[i].latency = arr[j].latency; + if ((long)arr[j].block_latency > 0) + arr[i].block_latency = arr[j].block_latency; + strcpy(arr[i].func, arr[j].func); + + /*min*/ + j = order; + arr[i + DATA_MIN * func_num].nr = arr[j].nr; + arr[i + DATA_MIN * func_num].num = arr[j].num; + arr[i + DATA_MIN * func_num].latency = arr[j].latency; + arr[i + DATA_MIN * func_num].block_latency = arr[j].block_latency; + strcpy(arr[i + DATA_MIN * func_num].func, arr[j].func); + + /* total */ + arr[i + DATA_TOTAL * func_num].nr = total_nr; + arr[i + DATA_TOTAL * func_num].num = total_num; + arr[i + DATA_TOTAL * func_num].latency = total_latency; + arr[i + DATA_TOTAL * func_num].block_latency = total_block_latency; + + /* ave */ + if (syscall && (order == PRINT_SYSCALL_NUM + 3 || order == PRINT_SYSCALL_NUM + 4)) + total = len - DATA_NR; + if (total == 0) + total = 1; + arr[i + DATA_AVE * func_num].nr = total_nr / total; + arr[i + DATA_AVE * func_num].num = total_num; + if (total_num == 0) + total_num = 1; + 
arr[i + DATA_AVE * func_num].latency = total_latency / total_num; + arr[i + DATA_AVE * func_num].block_latency = total_block_latency / total_num; + strcpy(arr[i + DATA_AVE * func_num].func, arr[j].func); + /* p90, p95, p99 */ + p90 = (len - DATA_NR) * 9 / 10; + p90_index = p90 * func_num + order; + p95 = (len - DATA_NR) * 95 / 100; + p95_index = p95 * func_num + order; + p99 = (len - DATA_NR) * 99 / 100; + p99_index = p99 * func_num + order; + arr[i + DATA_P95 * func_num].nr = arr[p95_index].nr; + for (m = 0; m < p99; m++ ) { + k = (len - DATA_NR - 1) * func_num + order; + j = m * func_num + order; + if ((long)arr[i].num == 0 || (syscall && order < PRINT_SYSCALL_NUM && arr[i].nr != arr[j].nr)) + continue; + if (m < p90) + arr[i + DATA_P90 * func_num].num += arr[j].num; + if (m < p95) { + arr[i + DATA_P95 * func_num].num += arr[j].num; + if (syscall && (order == PRINT_SYSCALL_NUM + 3 || order == PRINT_SYSCALL_NUM + 4)) + arr[i + DATA_P95 * func_num].nr += arr[j].nr; + } + arr[i + DATA_P99 * func_num].num += arr[j].num; + } + arr[i + DATA_P90 * func_num].latency = arr[p90_index].latency; + arr[i + DATA_P90 * func_num].block_latency = arr[p90_index].block_latency; + strcpy(arr[i + DATA_P90 * func_num].func, arr[p90_index].func); + arr[i + DATA_P95 * func_num].latency = arr[p95_index].latency; + arr[i + DATA_P95 * func_num].block_latency = arr[p95_index].block_latency; + arr[i + DATA_P99 * func_num].latency = arr[p99_index].latency; + arr[i + DATA_P99 * func_num].block_latency = arr[p99_index].block_latency; +} +int read_data(int delay_cycle, int tmp_delay, int fd, int func_num, struct func_latency *data) +{ + int index = 0, ret; + char buf[4] = {'1'}; + + while (delay_cycle > 0) { + control_os_stat(buf, "control_stat"); + usleep(1000); + control_os_stat(buf, "control_stat"); + memset(buf, '0', sizeof(*buf)); + usleep(1000); + control_os_stat(buf, "control_stat"); + buf[0] = '1'; + + usleep(tmp_delay); + + ret = read(fd, &data[index * func_num], func_num * sizeof(struct func_latency)); + if (ret <= 0) { + printf("read error:%d\n", ret); + break; + } + delay_cycle--; + index++; + } + + return index; +} + +void data_sort_many(struct func_latency *data, int delay_cycle, int func_num) +{ + int i; + + data_sort_syscall(data, delay_cycle + DATA_NR, func_num); + for(i = 0; i < func_num; i++) + data_sort(data, delay_cycle + DATA_NR, func_num, i, 1); +} + +void print_info(char *info) +{ + printf("\n\033[22;35m<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>>>>>>>>>>>\033[0m\n"); + printf("\033[22;35m<<<<<<<<<<<<<<<<<<< %16s >>>>>>>>>>>>>>>>>>>\033[0m\n", info); +} +void show_system_info(struct func_latency *data, int func_num, int index) +{ + int i, j, m, n; + + printf("\033[22;35m<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\033[0m\n"); + printf("\033[22;35m<< Only block ave delay is longer than before, maybe: mutex/sem lock contention or wait for resource, need to see the block pos >>\033[0m\n"); + printf("\033[22;35m<< AVE delay is longer than before, maybe: spin lock contention or more process running or code need improved, need to see the block pos >>\033[0m\n"); + printf("\033[22;35m<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< syscall stat >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\033[0m\n"); + + if (index > 0) { + printf("-----ave latency with its num and block latency , based the same one of num order------\n"); + j = index * func_num; + for (i = 0; i < 
PRINT_SYSCALL_NUM; i++) { + printf("\033[1;32m nr:%4ld; num:%8ld, max latency:%8ld(ns) block:%8ld(ns) %s\033[0m\n", + data[j].nr, + data[j].num, data[j].latency, data[j].block_latency, + data[j].func); + j++; + } + printf("---------max latency with its num and latency, from all data ---------\n"); + j = (index + 9 ) * func_num; + for (i = 0; i < PRINT_SYSCALL_NUM; i++) { + printf("\033[1;32m nr:%4ld; num:%8ld, latency:%8ld(ns) max block:%8ld(ns) %s\033[0m\n", + data[j].nr, + data[j].num, data[j].latency, data[j].block_latency, + data[j].func); + j++; + } + printf("---------max block latency with its num and latency, from all data ---------\n"); + j = (index + 10 ) * func_num; + for (i = 0; i < PRINT_SYSCALL_NUM; i++) { + printf("\033[1;32m nr:%4ld; num:%8ld, latency:%8ld(ns) max block:%8ld(ns) %s\033[0m\n", + data[j].nr, + data[j].num, data[j].latency, data[j].block_latency, + data[j].func); + j++; + } + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 0, show block latency */ + print_info("access vm"); + j = (index + 8 ) * func_num + i; + printf("\033[1;32m num:%8ld, max block latency:%8ld(ns),fuzzy stat:%s\033[0m\n", + data[j].num, + data[j].block_latency, + data[j].func); + j = index * func_num + i; + printf("\033[1;32m num:%8ld, ave block latency:%8ld(ns),fuzzy stat:%s\033[0m\n", + data[j + DATA_AVE * func_num].num, + data[j + DATA_AVE * func_num].block_latency, + data[j + DATA_AVE * func_num].func); + i++; + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 1 + 2 */ + print_info("irq"); + j = index * func_num + i; + for (m = 0; m < 2; m++) { + printf("\033[1;32m num:%8ld, ave irq latency:%8ld(ns) %s\033[0m\n", + data[j + DATA_AVE * func_num].num, + data[j + DATA_AVE * func_num].latency, + data[j].func); + i++; + j++; + } + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 3 */ + printf("\n\033[22;35m <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\033[0m\n"); + printf("\033[22;35m <<<<<< for numa: numa distribution maybe used with rq running num per node >>>>>\033[0m\n"); + printf("\033[22;35m <<<<<< for syscall/fs: location performance caused by memory allocation or bio or fs >>>>>\033[0m\n"); + printf("\033[22;35m <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< memory >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\033[0m\n"); + + j = index * func_num + i; + printf("\033[1;32m num:%8ld; not hit:%8ld; ave latency:%8ld(ns); block:%8ld(ns); %s\033[0m\n", + data[j].num, + data[j].nr, + data[j].latency, + data[j].block_latency, + data[j].func); + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 4 */ + i++; + j = index * func_num + i; + printf("\033[1;32m num:%8ld; size:%8ld(k); ave block:%8ld(ns); %s\033[0m\n", + data[j + DATA_AVE * func_num].num, + data[j + DATA_AVE * func_num].nr, + data[j + DATA_AVE * func_num].block_latency, + data[j + DATA_AVE * func_num].func); + i++; + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 5, + 6 */ + n = (index + 8 ) * func_num + i; + j = index * func_num + i; + m = 0; + for (;i < PRINT_SYSCALL_NUM + PRINT_MORE_NUM - 2; i++) { + printf("\033[1;32m num:%8ld; mem:%8ld(k); ave block:%8ld(ns); node:%8ld, %s\033[0m\n", + data[j + DATA_AVE * func_num].num, + data[j + DATA_AVE * func_num].latency, + data[j + DATA_AVE * func_num].block_latency, + data[j + DATA_AVE * func_num].nr, + data[j].func); + n++; + j++; + m++; + if (m >= 2) { + i++; + break; + } + } + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 7, + 8 */ + print_info("rq running num"); + m = 0; + j = index * func_num + i; + for (;i < PRINT_SYSCALL_NUM + PRINT_MORE_NUM; i++) { + printf("\033[1;32m node:%8ld, nr running:%8ld; 
ave busy:%8ld; %s\033[0m\n", + data[j + DATA_AVE * func_num].nr, + data[j + DATA_AVE * func_num].num, + data[j + DATA_AVE * func_num].latency, + data[j].func); + j++; + m++; + if (m >= 2) + break; + } + + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 9, + 10 */ + i++; + print_info("bio"); + j = index * func_num + i; + n = (index + 8 ) * func_num + i; + for (m = 0; m < 2; m++) { + printf("\033[1;32m num:%8ld; ave latnecy:%8ld(ns); block:%8ld(ns); %s\033[0m\n", + data[j + DATA_AVE * func_num].num, + data[j + DATA_AVE * func_num].latency, + data[j + DATA_AVE * func_num].block_latency, + data[j].func); + j++; + n++; + i++; + } + /* PRINT_MORE_NUM:PRINT_SYSCALL_NUM + 11, +12 */ + j = index * func_num + i; + n = (index + 8 ) * func_num + i; + for (m = 0; m < 2; m++) { + printf("\033[1;32m num:%8ld; ave block latency:%8ld(ns); %s\033[0m\n", + data[j + DATA_AVE * func_num].num, + data[j + DATA_AVE * func_num].block_latency, + data[j].func); + j++; + n++; + i++; + } + for (; i < TRACK_SYSCALL_NUM; i++) { + m = index * func_num + i; + if (data[m].num > 0) + printf("\033[1;32m num:%8ld; %s\033[0m\n", + data[m].num, + data[m].func); + } + } +} +void show_data(int index, char *func, int func_num, struct func_latency *test_data, unsigned long frequency) +{ + printf("\033[1;32m=== ave ===: num:%8ld, latency:%8ld(ns), block latency:%8ld(ns), %s\033[0m\n", + test_data[index + DATA_AVE * func_num].num, test_data[index + DATA_AVE * func_num].latency * 1000 / frequency, + test_data[index + DATA_AVE * func_num].block_latency * 1000 / frequency, func); + printf("\033[1;32m=== max ===: num:%8ld, latency:%8ld(ns), block latency:%8ld(ns), %s\033[0m\n", + test_data[index].num, test_data[index].latency * 1000 / frequency, test_data[index].block_latency * 1000 / frequency, func); + printf("\033[1;32m=== min ===: num:%8ld, latency:%8ld(ns), block latency:%8ld(ns), %s\033[0m\n", + test_data[index + DATA_MIN * func_num].num, test_data[index + DATA_MIN * func_num].latency * 1000 / frequency, + test_data[index + DATA_MIN * func_num].block_latency * 1000 / frequency, func); + printf("\033[1;32m=== p90 ===: num:%8ld, latency:%8ld(ns), block latency:%8ld(ns), %s\033[0m\n", + test_data[index + DATA_P90 * func_num].num, test_data[index + DATA_P90 * func_num].latency * 1000 / frequency, + test_data[index + DATA_P90 * func_num].block_latency * 1000 / frequency, func); + printf("\033[1;32m=== p95 ===: num:%8ld, latency:%8ld(ns), block latency:%8ld(ns), %s\033[0m\n", + test_data[index + DATA_P95 * func_num].num, test_data[index + DATA_P95 * func_num].latency * 1000 / frequency, + test_data[index + DATA_P95 * func_num].block_latency * 1000 / frequency, func); + printf("\033[1;32m=== p99 ===: num:%8ld, latency:%8ld(ns), block latency:%8ld(ns), %s\033[0m\n\n", + test_data[index + DATA_P99 * func_num].num, test_data[index + DATA_P99 * func_num].latency * 1000 / frequency, + test_data[index + DATA_P99 * func_num].block_latency * 1000 / frequency, func); +} + +void show_hot_path(int func_num, struct func_latency *test_data, unsigned long frequency) +{ + int i; + + for (i = 0; i < func_num; i++) { + printf("\033[1;32m======:num:%8ld, ave latency:%8ld(ns), block ave latency:%8ld(ns), current func:%s()\033[0m\n", + test_data[i].num, test_data[i].latency * 1000 / frequency, + test_data[i].block_latency * 1000 / frequency, test_data[i].func); + } +} + +void show_signal_info(int func_num, struct func_latency *test_data) +{ + int i; + + for (i = 0; i < func_num; i++) { + if (test_data[i].num == 0) + continue; + printf("\033[1;32m======:target 
pid:%8ld, %s, current process:%d, %s\033[0m\n", + test_data[i].num, (char *)(test_data[i].func + 128), test_data[i].nr, + test_data[i].func); + } +} +static int strisdigit(char *str) +{ + return (strspn(str, "0123456789")==strlen(str)); +} +int main(int argc,char *argv[]) +{ + int fd, ret, i, func_ret, j; + char buf[4] = {'1'}; + char name[NAME_LENGTH] = "vfs_read"; + char origin[NAME_LENGTH] = "vfs_read"; + char proc[NAME_LENGTH] = "0"; + unsigned long func_ptr; + int func_num = 1; + int func_total = 0; + char parent_name[64] = "vfs_read"; + char grand_parent_name[64] = "vfs_read"; + char *more_name; + int print = 0, scene = 0, uses_var = 0; + unsigned long delay = 5, tmp_delay, tmp_cycle = 0, tmp_index = 0; + int block = 0, delay_cycle = 1, index = 0, base_cycle; + int nr_node = 0, hot = 0, sample = 1; + struct func_latency *test_data; + int register_status; + int pointer = 2; + int type; + unsigned long frequency = 1; + int is_digit = 0; + + int opt; + char *string = "0:1:2:3:4:5:6:7:8:9:f:n:i:t:b:h:p:s:x"; + struct option longopts[] = { + {"sc", 1, NULL, 'a'}, + {"10", 1, NULL, 'a'}, + {"11", 1, NULL, 'g'}, + {"12", 1, NULL, 'c'}, + {"13", 1, NULL, 'd'}, + {"14", 1, NULL, 'e'}, + {"15", 1, NULL, 'j'}, + {"16", 1, NULL, 'k'}, + {"17", 1, NULL, 'l'}, + {"18", 1, NULL, 'm'}, + {"19", 1, NULL, 'o'}, + {"20", 1, NULL, 'q'}, + {"21", 1, NULL, 'r'}, + {"22", 1, NULL, 's'}, + {"23", 1, NULL, 'u'}, + {"24", 1, NULL, 'v'}, + {"25", 1, NULL, 'w'}, + {"26", 1, NULL, 'y'}, + {"27", 1, NULL, 'z'}, + {"28", 1, NULL, '~'}, + {"29", 1, NULL, '!'}, + {"30", 1, NULL, '@'}, + {"31", 1, NULL, '#'}, + {"32", 1, NULL, '$'}, + {"33", 1, NULL, '%'}, + {"34", 1, NULL, '^'}, + {"35", 1, NULL, '&'}, + {"36", 1, NULL, '*'}, + {"37", 1, NULL, '('}, + {"38", 1, NULL, ')'}, + {"39", 1, NULL, '-'}, + {"40", 1, NULL, '+'}, + {"var", 1, NULL, '+'}, + {"ht", 1, NULL, 'h'}, + {"de", 1, NULL, 't'}, + {"ptr", 1, NULL, '_'}, + {0, 0, 0, 0}, + }; + + more_name = (char *)malloc(50 * NAME_LENGTH); + if (!more_name) + return -ENOMEM; + + while ((opt = getopt_long(argc, argv, string, longopts, NULL))!= -1) { + switch (opt) { + case 'f': + case '0': + func_ptr = optarg; + strcpy(origin, optarg); + strcpy(name, optarg); + strcpy(&more_name[0], optarg); + func_total++; + break; + case '1': + strcpy(parent_name, optarg); + strcpy(&more_name[NAME_LENGTH], optarg); + func_total++; + break; + case '2': + strcpy(grand_parent_name, optarg); + strcpy(&more_name[NAME_LENGTH * 2], optarg); + func_total++; + break; + case '3': + strcpy(&more_name[NAME_LENGTH * 3], optarg); + func_total++; + break; + case '4': + strcpy(&more_name[NAME_LENGTH * 4], optarg); + func_total++; + break; + case '5': + strcpy(&more_name[NAME_LENGTH * 5], optarg); + func_total++; + break; + case '6': + strcpy(&more_name[NAME_LENGTH * 6], optarg); + func_total++; + break; + case '7': + strcpy(&more_name[NAME_LENGTH * 7], optarg); + func_total++; + break; + case '8': + strcpy(&more_name[NAME_LENGTH * 8], optarg); + func_total++; + break; + case '9': + strcpy(&more_name[NAME_LENGTH * 9], optarg); + func_total++; + break; + case 'a': + is_digit = strisdigit(optarg); + if (is_digit == 0) { + strcpy(&more_name[NAME_LENGTH * 10], optarg); + func_total++; + }else + scene = atoi(optarg); + break; + case 'g': + strcpy(&more_name[NAME_LENGTH * 11], optarg); + func_total++; + break; + case 'c': + strcpy(&more_name[NAME_LENGTH * 12], optarg); + func_total++; + break; + case 'd': + strcpy(&more_name[NAME_LENGTH * 13], optarg); + func_total++; + break; + case 'e': + 
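/* Review note (illustration only; the invocation mirrors how stat.sh in this patch + * drives the tool): options -0..-9 and the numeric long options --10..--40 fill + * consecutive more_name[NAME_LENGTH * i] slots, so a hypothetical run such as + * ./os_stat_blongm -0 vfs_read -1 ksys_read -n 2 -t 5 --ptr 2 -p 0 -x + * would track latency for two functions over a 5 second window. */ +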
strcpy(&more_name[NAME_LENGTH * 14], optarg); + func_total++; + break; + case 'j': + strcpy(&more_name[NAME_LENGTH * 15], optarg); + func_total++; + break; + case 'k': + strcpy(&more_name[NAME_LENGTH * 16], optarg); + func_total++; + break; + case 'l': + strcpy(&more_name[NAME_LENGTH * 17], optarg); + func_total++; + break; + case 'm': + strcpy(&more_name[NAME_LENGTH * 18], optarg); + func_total++; + break; + case 'o': + strcpy(&more_name[NAME_LENGTH * 19], optarg); + func_total++; + break; + case 'q': + strcpy(&more_name[NAME_LENGTH * 20], optarg); + func_total++; + break; + case 'r': + strcpy(&more_name[NAME_LENGTH * 21], optarg); + func_total++; + break; + case 's': + is_digit = strisdigit(optarg); + if (is_digit == 0) { + strcpy(&more_name[NAME_LENGTH * 22], optarg); + func_total++; + } else + sample = atoi(optarg); + break; + case 'u': + strcpy(&more_name[NAME_LENGTH * 23], optarg); + func_total++; + break; + case 'v': + strcpy(&more_name[NAME_LENGTH * 24], optarg); + func_total++; + break; + case 'w': + strcpy(&more_name[NAME_LENGTH * 25], optarg); + func_total++; + break; + case 'y': + strcpy(&more_name[NAME_LENGTH * 26], optarg); + func_total++; + break; + case 'z': + strcpy(&more_name[NAME_LENGTH * 27], optarg); + func_total++; + break; + case '~': + strcpy(&more_name[NAME_LENGTH * 28], optarg); + func_total++; + break; + case '!': + strcpy(&more_name[NAME_LENGTH * 29], optarg); + func_total++; + break; + case '@': + strcpy(&more_name[NAME_LENGTH * 30], optarg); + func_total++; + break; + case '#': + strcpy(&more_name[NAME_LENGTH * 31], optarg); + func_total++; + break; + case '$': + strcpy(&more_name[NAME_LENGTH * 32], optarg); + func_total++; + break; + case '%': + strcpy(&more_name[NAME_LENGTH * 33], optarg); + func_total++; + break; + case '^': + strcpy(&more_name[NAME_LENGTH * 34], optarg); + func_total++; + break; + case '&': + strcpy(&more_name[NAME_LENGTH * 35], optarg); + func_total++; + break; + case '*': + strcpy(&more_name[NAME_LENGTH * 36], optarg); + func_total++; + break; + case '(': + strcpy(&more_name[NAME_LENGTH * 37], optarg); + func_total++; + break; + case ')': + strcpy(&more_name[NAME_LENGTH * 38], optarg); + func_total++; + break; + case '-': + strcpy(&more_name[NAME_LENGTH * 39], optarg); + func_total++; + break; + case '+': + strcpy(&more_name[NAME_LENGTH * 40], optarg); + func_total++; + is_digit = strisdigit(optarg); + if (is_digit == 0) { + strcpy(&more_name[NAME_LENGTH * 11], optarg); + func_total++; + }else + uses_var = atoi(optarg); + break; + case 'n': + func_num = atoi(optarg); + break; + case 'i': + print = atoi(optarg); + break; + case 't': + delay = atoi(optarg); + break; + case 'b': + block = atoi(optarg); + break; + case 'h': + hot = atoi(optarg); + break; + case 'p': + strcpy(proc, optarg); + break; + case '_': + pointer = atoi(optarg); + break; + default: + break; + } + } + + ret = enable_os_stat(); + if (ret <= 0) { + printf("enable error:%d\n", ret); + return 0; + } + + while (1) { + register_status = get_register_status(); + if (!register_status || register_status == FTRACE_REGISTER_FAILED) + break; + usleep(100000); + } + enable_track_proc(proc); + enable_block_stat(block); + set_monitor_sample(sample); + + if (print == 2) + ret = control_os_stat(buf, "trace_type"); + /* trace signal only signal = (print - 2) */ + if (scene >= 3 || uses_var) { + char trace_sig[4]; + if (scene >= 3) + sprintf(trace_sig, "%d", scene); + else + sprintf(trace_sig, "%d", SIGRTMAX_ABOVE); + ret = control_os_stat(trace_sig, "trace_type"); + } + + tmp_delay = 
delay * 1000 * 1000; + delay_cycle = 1; + /* hot = 1: hot path; sample == 100:sample rate. only hot path no need p90 stat */ + if ((hot && sample != 100)) + goto run_test; + /* delay * 1000 * 1000/ 100, s->us */ + tmp_delay = delay * 10000; + /* min 1s */ + if (tmp_delay < 1000000) + tmp_delay = 1000000; + /* max 10s */ + if (tmp_delay > 10000000) + tmp_delay = 10000000; + delay_cycle = delay * 1000 * 1000 / tmp_delay; + if (delay_cycle == 0) + delay_cycle = 1; + + /* each base_cycle show data once, total cycle: delay_cycle */ + base_cycle = delay * 1000000 / (tmp_delay * 20); + /* base_cycle = [5, delay_cycle] */ + if (base_cycle < 5) + base_cycle = 5; + if (delay < 20 || base_cycle > delay_cycle) + base_cycle = delay_cycle; + +run_test: + if (func_num >= HOOK_FUNC_NUM) + func_num = HOOK_FUNC_NUM; + if (print == 1) + func_num = TRACK_SYSCALL_NUM; + test_data = (struct func_latency *)malloc(func_num * sizeof(struct func_latency) * (delay_cycle + 11)); + if (!test_data) + return 0; + memset((void *)test_data, 0, func_num * sizeof(struct func_latency) * (delay_cycle + 11)); + + if (pointer == 0) + goto not_pointer; + + if (print != 1) + goto print_one; + + enable_sys_stat(); + fd = open("/proc/sys/os_aware/data", O_RDONLY); + if (fd < 0) { + printf("open error:%d\n", fd); + goto return_disable; + } + + while (tmp_cycle < delay_cycle) { + + index = read_data(1, delay_cycle * tmp_delay, fd, func_num, &test_data[tmp_index]); + tmp_index += base_cycle * func_num; + + data_sort_many(test_data, 1, func_num); + show_system_info(test_data, func_num, 1); + + tmp_cycle += base_cycle; + } + + close(fd); +return_disable: + disable_sys_stat(); + + return 0; + +print_one: + ret = process_func_pointer(name, parent_name, origin); + if (ret < 0 || ret == 1 || !strncmp(origin, "0x0", 3)) { + memset(origin, 0 , 256); + ret = process_func_pointer(name, grand_parent_name, origin); + } + + /* only transfor pointer to function */ + if (pointer == 1) { + printf("\033[1;32m ======:pointer:%s, current func:%s()\033[0m\n", + name, origin); + return ret; + } + + if (ret < 0) + return ret; + + if (strstr(origin, "->")) + return 0; +not_pointer: + /* wait last status ready */ + ret = get_ftrace_register_status(); + if (ret == FTRACE_UNREGISTERING) + ret = test_status(FTRACE_INIT); + + if (ret != FTRACE_INIT && ret != FTRACE_REGISTER_FAILED) + return -EINVAL; + + if (scene > SIGRTMAX + 2 || uses_var) { + type = scene - SIGRTMAX - 2; + if (type < 0) + type = uses_var; + main_info(func_total - func_num , &more_name[func_num * NAME_LENGTH], NAME_LENGTH, type); + func_total = func_total < func_num?func_total:func_num; + file_fd = open("/var/log/t-ops/os_stat.log", + O_RDWR | O_CREAT, S_IRWXU | S_IRGRP | S_IROTH); + if (file_fd < 0) { + printf("main open os_stat.log error:%d\n", file_fd); + return -EINVAL; + } + file_fd_total = open("/var/log/t-ops/os_stat_total.log", + O_RDWR | O_CREAT, S_IRWXU | S_IRGRP | S_IROTH); + if (file_fd_total < 0) { + close(file_fd); + printf("main open os_stat.log error:%d\n", file_fd_total); + return -EINVAL; + } + } + + /* register functions, which to get latency performance */ + if (pointer == 0) { + if (func_total != func_num) { + printf("func num error: get:%d, may:%d\n", func_total, func_num); + func_total = func_total < func_num?func_total:func_num; + } + i = 1; + while (i < func_total) { + register_one_ftrace_func(&more_name[i * NAME_LENGTH]); + i++; + } + strcpy(origin, &more_name[0]); + register_ftrace_func(&more_name[0]); + } else { + strcpy(origin, &more_name[0]); + func_ret = 
register_ftrace_func(&more_name[0]); + if (func_ret < 0) + func_ret = register_kret_ftrace_func(origin); + } + + /* wait status ready */ + while (1) { + usleep(10000); + register_status = get_register_status(); + if (register_status == FTRACE_REGISTER_FAILED) + goto out; + if (register_status) + break; + } + + if (!register_status) + goto out; + + /* read data start */ + fd = open("/proc/sys/os_aware/func_data", O_RDONLY); + if (fd < 0) { + printf("open func_data error:%d\n", fd); + goto out; + } + + if (hot == 1) { + tmp_cycle = 0; + base_cycle = delay_cycle; + } + + /* get data to show at once */ + if (print > 2) + base_cycle = 1; + + /* read data and show data */ + while (tmp_cycle < delay_cycle) { + + index = read_data(base_cycle, tmp_delay, fd, func_num, &test_data[tmp_index]); + + frequency = get_cpu_frequency(); + if (frequency == 0) + frequency = 1; + + if (scene > SIGRTMAX + 2){ + show_kernel_info(func_num, &test_data[tmp_index], file_fd, file_fd_total); + } + /* only show signal (such as: kill -9) info in delay*/ + else if (scene > 2 && scene < SIGRTMAX) + show_signal_info(func_num, &test_data[tmp_index]); + /* sort and show data for latency between more functions */ + else if (print == 2) { + j = 1; + for (i = 0; i < func_num; i+=2) { + sprintf(proc, "latency between func:%s()->%s()", test_data[j].func, test_data[j - 1].func); + index = func_num * delay_cycle + j; + data_sort(test_data, delay_cycle + DATA_NR, func_num, j++, 0); + show_data(index, proc, func_num, test_data, frequency); + } + } + /* sort and show data for latency of many functions */ + else if (hot != 1 && func_num > 1) { + for (i = 0; i < func_num; i++) { + data_sort(test_data, delay_cycle + DATA_NR, func_num, i, 0); + j = func_num * delay_cycle + i; + show_data(j, test_data[j].func, func_num, test_data, frequency); + } + } + /* sort and show data for latency for hot path */ + else if (hot) + show_hot_path(func_num, test_data, frequency); + /* sort and show data for latency for one function */ + else { + data_sort(test_data, delay_cycle + DATA_NR, 1, 0, 0); + show_data(delay_cycle, test_data[delay_cycle].func, 1, test_data, frequency); + } + tmp_index += base_cycle * func_num; + tmp_cycle += base_cycle; + } + + /* read data finish */ + close(fd); + +out: + if (scene > SIGRTMAX + 2) { + close(file_fd); + close(file_fd_total); + } + + unregister_one_func(origin); + + free(test_data); + free(more_name); + return func_ret; +} diff --git a/ops/os_stat/os_stat_user/main.h b/ops/os_stat/os_stat_user/main.h new file mode 100644 index 0000000000000000000000000000000000000000..6791105e37aeae5814352475c4e92582a1b9d13e --- /dev/null +++ b/ops/os_stat/os_stat_user/main.h @@ -0,0 +1,53 @@ +#define DISK_SDA 8 +#define DISK_DM 251 +#define DISK_VD 253 +#define DISK_NVME 259 +#define DISK_HDD 65 +#define DISK_HDD1 66 +#define DISK_HDD2 67 +#define DISK_HDD3 68 +#define DISK_SDA_NUM 40 +#define DISK_HDD_NUM 60 +#define DISK_HDD_SUBNUM 15 +#define DISK_DM_NUM 16 +#define DISK_VD_NUM 80 +#define DISK_NVME_NUM 80 +#define DISK_TOTAL_NUM (DISK_SDA_NUM + DISK_HDD_NUM + DISK_DM_NUM + DISK_VD_NUM + DISK_NVME_NUM) +#define PRINT_SYSCALL_NUM 12 +#define PRINT_MORE_NUM 13 +#define TRACK_SYSCALL_NUM (PRINT_SYSCALL_NUM + PRINT_MORE_NUM + DISK_TOTAL_NUM) +#define HOOK_FUNC_NUM 50 +#define NAME_LENGTH 256 +#define NAME_MAX 255 +#define SIGRTMAX 64 +#define SIGRTMAX_ABOVE 100 +enum ftrace_status { + FTRACE_INIT = 0, + FTRACE_KRETPROBE_REGISTER = 1, + FTRACE_REGISTER = 2, + FTRACE_UNREGISTERING = 3, + FTRACE_UNREGISTE_STARTED = 4, + 
FTRACE_UNREGISTED = 5, + FTRACE_REGISTER_FAILED = 6 +}; + +/* malloc 7 more, to stat max/min .etc */ +enum data_stat_type { + DATA_MAX=0, + DATA_MIN, + DATA_TOTAL, + DATA_AVE, + DATA_P90, + DATA_P95, + DATA_P99, + DATA_NR +}; + +struct func_latency { + unsigned long nr; + long num; + unsigned long latency; + unsigned long block_latency; + char func[NAME_LENGTH]; +}; +extern int record_data(int cycle); diff --git a/ops/os_stat/os_stat_user/main_scene.c b/ops/os_stat/os_stat_user/main_scene.c new file mode 100644 index 0000000000000000000000000000000000000000..96841fae9c0967cd27b28775fc9c71266ab115bf --- /dev/null +++ b/ops/os_stat/os_stat_user/main_scene.c @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +#include +#include "main.h" + +int enable_slub_sk_adjust(void) +{ + char buf[4] = {'1'}, ret, fd; + + fd = open("/proc/sys/os_aware/enable_slub_debug", O_RDWR); + if (fd < 0) { + printf("open eenable_slub_debug rror:%d\n", fd); + return fd; + } + + + ret = write(fd, buf, sizeof(*buf)); + if ( ret <= 0) { + printf("enable error:%d\n", ret); + goto out; + } + + lseek(fd, 0, SEEK_SET); + + /* record kernel log into file */ + record_data(1); + + buf[0] = '0'; + ret = write(fd, buf, sizeof(*buf)); + if ( ret <= 0) + printf("enable error:%d\n", ret); + +out: + close(fd); + return ret; +} + +int main(int argc,char *argv[]) +{ + int opt; + int print = 0, func_num = 0; + unsigned long delay = 5; + char *string = "0:1:2:3:4:5:6:7:8:9:f:n:i:t:b:h:p:x"; + struct option longopts[] = { + {"sc", 1, NULL, 'a'}, + {"10", 1, NULL, 'a'}, + {"11", 1, NULL, 'g'}, + {"12", 1, NULL, 'c'}, + {"13", 1, NULL, 'd'}, + {"14", 1, NULL, 'e'}, + {"15", 1, NULL, 'j'}, + {"16", 1, NULL, 'k'}, + {"17", 1, NULL, 'l'}, + {"18", 1, NULL, 'm'}, + {"19", 1, NULL, 'o'}, + {"20", 1, NULL, 'q'}, + {0, 0, 0, 0}, + }; + + while ((opt = getopt_long(argc, argv, string, longopts, NULL))!= -1) { + switch (opt) { + case 'a': + print = atoi(optarg); + case 'n': + func_num = atoi(optarg); + break; + case 't': + delay = atoi(optarg); + break; + } + } + + if (print == (SIGRTMAX + 7)) + enable_slub_sk_adjust(); +} diff --git a/ops/os_stat/os_stat_user/print_func_path.py b/ops/os_stat/os_stat_user/print_func_path.py new file mode 100755 index 0000000000000000000000000000000000000000..3d86dd25fdc797d6ff92fc4d852bb15f57c63afa --- /dev/null +++ b/ops/os_stat/os_stat_user/print_func_path.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +import sys + + +def parse_conf(_f, upper_level, num): + _ret = 0 + cycle = 0 + last_idx = 0 + with open(_f, 'r') as _fd: + for _l in _fd.readlines(): + _ll = _l.strip() + + # skip null lines + if not _ll: + continue + + # skip comment lines + if _ll.find("#") == 0: + continue + + # get kv + eidx = _ll.find("(") + if eidx == -1: + print("WARNNING: not = find in " + _ll); + continue + level = upper_level + tmpidx = _ll.find("+-") + if tmpidx == -1: + tmpidx = _ll.find("\-") + if tmpidx != -1 and eidx != -1: + lastidx = tmpidx + level = tmpidx + key = _ll[0:tmpidx].strip() + val = _ll[0:].strip() + count = 0; + for i in key: + if i == "|": + count += 1 + level = count + else: + if last_idx == tmpidx: + last_idx = 0 + level = tmpidx + key = _ll[0:tmpidx].strip() + val = _ll[0:].strip() + count = 0; + for i in key: + if i == "|": + count += 1 + level = count + if last_idx != 0: + level += 1 + if level >= int(upper_level) + 2: + continue + if level <= int(upper_level): + break + + cycle += 1 + if cycle < int(num): + continue + print(val) + break + + return _ret + +if __name__ == "__main__": + _f1 
= sys.argv[1]; + level = sys.argv[2]; + num = sys.argv[3]; + + _kv1 = parse_conf(_f1, level, num); diff --git a/ops/os_stat/os_stat_user/stat.sh b/ops/os_stat/os_stat_user/stat.sh new file mode 100755 index 0000000000000000000000000000000000000000..506d06497ad25e287bdcab3d07645c3111cd59e5 --- /dev/null +++ b/ops/os_stat/os_stat_user/stat.sh @@ -0,0 +1,504 @@ +#!/bin/sh +function func() +{ + ./get_subfunc_level_name.py $5 $1 $2 $3 $4 +} +function level() +{ + ./get_func_level.py func_tree.txt +} +function num() +{ + ./get_data.py data.txt $1 +} +function latency() +{ + ./get_latency.py data.txt $1 +} +function get_func() +{ + ./get_pointer_func.py data.txt +} +function find_function_path() +{ + find $1 -name "*.[ch]" -exec grep -E -w $2 {} +| more | grep \( > function_path.txt + ./get_function_subpath.py function_path.txt $2 +} +function match_func_tree() +{ + ./get_func_name.py $1 $2 +} +function cflow_func() +{ + tree=$(match_func_tree func_tree.txt $2) + if [ ! -z "$tree" ] ; then + return + fi + #modify kernel source some symbos,such as __ change into _aa, -> change into bb, + #only for cflow can see these functions, this work is too long, so only first time need it. + #after first time, can skip this step + if [ $3 -eq 1 ]; then + ./change.py $1/ + fi + + test_func=$2 + path=$(find_function_path $1 $test_func) + depth=3 + + if [ $4 == 1 ]; then + depth=8 + fi + + if [ ! -z "$path" ] || [ ! -n "$path" ] ; then + cflow -T -d $depth -i _ -m $test_func $path/*.c $path/*.h $1/include/*.h $1/include/*/*.h -o func_tree_xx.txt 2> clow_err.txt + elif [ $# -eq 2 ]; then + cflow -T -d $depth -i _ -m $test_func $1/fs/*/*.c $1/mm/*.c $1/*/*.h $1/*/*/*.h -o func_tree_xx.txt 2> clow_err.txt + else : + cflow -T -d $depth -i _ -m vfs_read $1/fs/*.c $1/include/linux/*.h $1/fs/ext4/*.c -o func_tree_xx.txt 2> clow_err.txt + fi + ./change_resume.py func_tree_xx.txt func_tree.txt +} + +check_cflow() { + cflow_version=$(cflow --version 2>&1) + exit_code=$? + + if [ $exit_code -ne 0 ]; then + echo "Error: cflow --version returned a non-zero exit code: $exit_code" >&2 + exit 1 + fi + + echo "cflow version: $cflow_version" + return 0 +} + +function strstr() +{ + echo $1 | grep $2 +} + +function do_work() +{ + print_first=0 + i=0 + k=0 + m=0 + Null_i=0 + func_level=-1 + func_index=0 + cur_num=() + cur_latency={} + total_latency={} + total_start_num={} + cur_level_stat=() + cur_index={} + max_array_index=1000000000 + cur_real_index={} + cur_func={} + tmp_func={} + last_level=1000000000 + cur_level=1000000000 + start_num=0 + parent_func={} + parent_func[0]="tmp" + parent_func[1]="tmp" + level_first=0 + start_level=1 + null_func={} + null_index={} + null_start={} + last_null_index=0 + total_last_level=0 + null=0 + pointer_last_cflow=0 + tree_file=func_tree.txt + last_sub={} + sub_func_level=1 + old_real_index=-1 + find_again=-1 + + found=$(match_func_tree func_tree.txt $7) + if [ -z "$found" ] ; then + return + fi + + while true; + do + #show the whole path + #./checkconfig_origngrap.py func_tree.txt + #./print_func_path.py $tree_file $func_level $func_index + + #get functionn name + res=$(func $func_level $func_index $start_num 1 $tree_file) + real_index=$(func $func_level $func_index $start_num 2 $tree_file) + if [ -z "$real_index" ]; then + real_index=$func_index + fi + #echo $res : "level:" $func_level "index:" $func_index "real:" $real_index "start:" $start_num + + if [[ ! 
-z "$res" && $old_real_index == $real_index ]]; then + func_index=$(($func_index+1)) + find_again=1 + continue + fi + if [[ $find_again == 1 && -z "$res" ]]; then + func_index=$(($func_index-1)) + func_level=$(($func_level+1)) + find_again=-2 + continue + fi + if [[ $find_again == -2 && -z "$res" ]]; then + func_level=$(($func_level-1)) + fi + find_again=-1 + if [ ! -z "$res" ]; then + sub=$(strstr $res "schedule") + if [[ $sub = "schedule" ]]; then + func_index=$(($func_index+1)) + continue + fi + sub=$(strstr $res "cond_resched") + if [[ $sub = "cond_resched" ]]; then + func_index=$(($func_index+1)) + continue + fi + fi + old_real_index=$real_index + if [[ -z "$res" && $5 == 1 && ! -z "$tmp_func" ]]; then + #hook function, catch data, print to data.txt + echo current:${tmp_func[@]} + ./os_stat_blongm ${tmp_func[@]} -n $m -i $print_first -t $2 -b 0 -h $4 --ptr 0 -p $6 -x &> data.txt + unset tmp_func + #cat data.txt + + #sub=$(strstr $res "write_iter") + #./checkconfig_origngrap.py data.txt + if [ $last_null_index -eq 0 ]; then + Null_i=0 + fi + while true; + do + tmp_index=${cur_index[$i - $m]} + #echo $i, $m $tmp_index ${cur_func[$i - $m + $k]}, $k + + #get the data + tmp_num=$(num $k) + #echo 000000000000000 $tmp_num, $k + if [ ! -n "$tmp_num" ] || [ $tmp_num -eq 0 ] && [ $last_null_index -eq 0 ]; then + null_func[$Null_i]=${cur_func[$i - $m + $k]} + null_index[$Null_i]=$((${cur_index[$i - $m + $k]} - 1)) + null_start[$Null_i]=${total_start_num[$i - $m + $k]} + #echo "save null:" $Null_i, ${null_index[$Null_i]}, ${null_start[$Null_i]}, ${total_start_num[$i - $m + $k]}, ${null_func[$Null_i]} + Null_i=$(($Null_i+1)) + k=$(($k+1)) + if [ $k -ge $m ];then + break + fi + continue + fi + #record data + cur_num[$i - $m + $k]=$tmp_num + tmp_latency=$(latency $k) + cur_latency[$i - $m + $k]=$tmp_latency + if [ ! -z "$tmp_num" ]; then + total_latency[$i - $m + $k]=`expr $tmp_latency \* $tmp_num` + else : + total_latency[$i - $m + $k]=0 + fi + #echo 000000000000001 ${cur_pointer_func[$i - $m + $k]}, $tmp_num, $tmp_latency, $i - $m + $k, $k + k=$(($k+1)) + if [ $k -ge $m ];then + break + fi + done + m=0 + k=0 + + for(( j=0; j<${#cur_num[@]}; j++ )) do + if [ ! -z "${cur_num[j]}" ] && [ ${cur_num[j]} != 0 ]; then + tmp_index=${cur_index[j]} + if [ ! -z "$start_num" ]; then + tmp_index=($tmp_index - $start_num) + fi + printf "******real index:%10d, index: %10d, level: %2d, num::%8d, latency:%10d ns, total latency:%10d ns, func:%s, origin:%s\n" ${cur_real_index[j]} $tmp_index ${cur_level_stat[j]} ${cur_num[j]} ${cur_latency[j]} ${total_latency[j]} ${cur_func[j]} ${cur_pointer_func[j]} + fi + done + + fi + + if [ -z "$res" ]; then + if [[ $last_level -eq $cur_level && $last_null_index -eq 0 ]] ; then + if [ $max_array_index -lt 1000000000 ]; then + pointer=$(strstr ${cur_func[$max_array_index]} "\->") + if [ -z "$pointer" ]; then + pointer=$(strstr ${cur_func[$max_array_index]} "INDIRECT_CALL_INET") + fi + #echo $max_array_index ":" $pointer + depth=8 + tree_file=sub_func_tree.txt + sub_func=${cur_pointer_func[$max_array_index]} + if [ $3 -eq 0 ]; then + depth=3 + tree_file=sub_func_tree_$sub_func_level.txt + sub_func_level=$(($sub_func_level+1)) + if [ -z "$pointer" ] && [ "$last_sub" == "$sub_func" ]; then + break + fi + #echo "123 stat finish", $last_sub, $sub_func + fi + touch $tree_file + if [ ! 
-z "$pointer" ] || [ $3 -eq 0 ]; then + pointer_last_cflow=${#cur_num[@]}; + tree=$(match_func_tree $tree_file $sub_func) + last_sub=$sub_func + if [ -z "$tree" ] ; then + old_real_index=-1 + path=$(find_function_path $1 $sub_func) + echo 1-----cflow next level:$1, $sub_func, $path--------- + if [ ! -z "$1" ] && [ ! -z "$path" ]; then + cflow -T -d $depth -i _ -m $sub_func $path/*.c $1/include/linux/*.h -o func_tree_xx.txt 2> clow_err.txt + else : + cflow -T -d $depth -i _ -m $sub_func $1/*/*.c $1/*/*/*.c $1/include/linux/*.h -o func_tree_xx.txt 2> clow_err.txt + fi + ./change_resume.py func_tree_xx.txt $tree_file + found=$(match_func_tree $tree_file $sub_func) + if [ -z "$found" ] ; then + return + fi + fi + func_level=-1 + func_index=0 + start_num=0 + Null_i=0 + continue + fi + fi + echo "stat finish" + break + fi + max_latency=0 + max_index=0 + max_level=0 + total_tmp=0 + start_num=0 + for(( j=$(($pointer_last_cflow+1)); j<${#cur_num[@]}; j++ )) do + if [ -z "${cur_num[j]}" ] || [ ${cur_num[j]} -eq 0 ]; then + continue + fi + + if [ ${cur_level_stat[j]} -lt $cur_level ]; then + continue + fi + total_tmp=$(($total_tmp + ${total_latency[j]})) + if [ $max_latency -lt ${total_latency[j]} ]; then + max_latency=${total_latency[j]} + max_index=${cur_index[j]} + max_array_index=$j + max_start=${total_start_num[j]} + max_func=${cur_func[j]} + fi + done + + #some functions can't be hooked, it's necessary to check these function's latency, + total_double=$(($total_tmp * 20)) + if [[ $last_null_index -gt 0 || $total_double -lt $total_last_level || $total_double -eq 0 ]] && [ $Null_i -gt 1 ]; then + if [ $last_null_index -eq 0 ]; then + last_null_index=$Null_i; + fi + last_null_index=$(($last_null_index - 1)) + max_index=${null_index[$last_null_index]} + max_latency=0 + start_num=${null_start[$last_null_index]} + parent_func[0]=${null_func[$last_null_index]} + null=1 + func_level=$(($cur_level-1)) + + elif [ $max_latency -ne 0 ]; then + total_last_level=$max_latency + start_level=1 + start_num=$(($max_start + 1)) + if [ ! -z "$max_func" ]; then + parent_func[1]=$max_func + fi + fi + + #recored parent function to find pointer function by hook parent, such as hook parent, and give the pointer + #to kernel, then %pF to print origin function to pointer + res=$(func $func_level $(($max_index)) $start_num 1 $tree_file) + + res=$(func $func_level $(($max_index)) $start_num 2 $tree_file) + + if [ ! -z "$res" ]; then + start_num=$(($res)) + fi + func_index=$(($res)) + func_level=$(($func_level+1)) + last_level=$func_level + level_first=1 + continue + if [ $max_latency -ne 0 ]; then + Null_i=0 + #total_last_level=$max_latency + + pointer=$(strstr ${cur_func[$max_array_index]} "\->") + if [ -z "$pointer" ]; then + pointer=$(strstr ${cur_func[$max_array_index]} "INDIRECT_CALL_INET") + fi + if [[ $max_array_index -lt 1000000000 && ! 
-z "$pointer" ]]; then + #echo $max_array_index ":" $pointer ":" ${cur_func[$max_array_index]} + echo 2-----cflow next level:$1, $sub_func--------- + cflow -T -d 8 -i _ -m ${cur_pointer_func[$max_array_index]} /data/tkernel4/*.c /data/tkernel4/*/*.c /data/tkernel4/*/*/*.c /data/tkernel4/include/linux/*.h -o func_tree_xx.txt 2> clow_err.txt + ./change_resume.py func_tree_xx.txt func_tree.txt + found=$(match_func_tree func_tree.txt ${cur_pointer_func[$max_array_index]}) + if [ -z "$found" ] ; then + return + fi + func_level=-1 + func_index=0 + start_num=0 + continue + fi + fi + #printf "total lantecy: %10d, index: %4d, parent index:%4d, level:%2d, this:%10d\n" $max_latency $func_index $max_index $func_level $total_last_level + continue + fi + + #hook function, catch data, print to data.txt + if [ $5 == 1 ]; then + pointer=$(strstr $res "\->") + if [ -z "$pointer" ]; then + pointer=$(strstr $res "INDIRECT_CALL_INET") + fi + cur_pointer_func[i]=$res + #only pointer need + #echo ------pointer:$pointer, $res + if [ ! -z "$pointer" ]; then + trace_func={} + trace_func[0]="-0 $res" + trace_func[1]="-1 ${parent_func[0]}" + trace_func[2]=" -2 ${parent_func[1]}" + echo --- pointer: ${trace_func[@]} --- + ./os_stat_blongm ${trace_func[@]} -n 1 -i $print_first -t $2 -b 0 -h $4 --ptr 1 -p $6 -x &> data.txt + cur_pointer_func[i]=$(get_func) + fi + tmp_func[m]="-$m ${cur_pointer_func[i]}" + if [ -z "${cur_pointer_func[i]}" ]; then + tmp_func[m]="-$m $res" + fi + #long opt, need -- + if [ $m -gt 9 ]; then + tmp_func[m]="-${tmp_func[m]}" + fi + #echo 000000000000001 $res, ${tmp_func[m]}, $m, ${cur_pointer_func[i]}, ${parent_func[0]}, ${parent_func[1]} + cur_num[$i]=0 + m=$(($m+1)) + else : + printf "%-40s %-40s %-40s\n" current:$res\(\), parent:${parent_func[0]}\(\), parent:${parent_func[1]}\(\) + ./os_stat_blongm -0 $res -1 ${parent_func[0]} -2 ${parent_func[1]} -n 1 -i $print_first -t $2 -b 0 -h $4 --ptr 2 -p $6 -x &> data.txt + ./checkconfig_origngrap.py data.txt + if [ $print_first -eq 1 ]; then + print_first=0 + cat data.txt + continue + fi + old_index=$func_index + fi + + #输出文件内容 + cur_level=$(($func_level+1)) + if [ $level_first -eq 0 ]; then + func_index=$(($func_index+1)) + fi + level_first=0 + if [ $5 == 0 ]; then + #get the data + tmp_num=$(num 0) + #echo 000000000000000 $tmp_num + if [ ! -n "$tmp_num" ] || [ $tmp_num -eq 0 ] ; then + null_func[$Null_i]=$res + null_index[$Null_i]=$old_index + null_start[$Null_i]=$start_num + Null_i=$(($Null_i+1)) + continue + fi + #record data + cur_num[i]=$tmp_num + tmp_latency=$(latency 0) + cur_latency[i]=$tmp_latency + #echo 000000000000000 $tmp_latency + if [ ! -z "$tmp_num" ]; then + total_latency[i]=`expr $tmp_latency \* $tmp_num` + else : + total_latency[i]=0 + fi + cur_pointer_func[i]=$(get_func) + fi + total_start_num[i]=$start_num + cur_level_stat[i]=$cur_level + cur_index[i]=$((func_index)) + cur_real_index[i]=$real_index + cur_func[i]=$res + i=$(($i+1)) + + if [ $start_level == 1 ]; then + parent_func[0]=$res + if [ ! -z "${cur_pointer_func[i]}" ]; then + parent_func[0]=${cur_pointer_func[i]} + fi + fi + start_level=0 + max_array_index=1000000000 + + if [ $5 == 0 ]; then + for(( j=0; j<${#cur_num[@]}; j++ )) do + if [ ${cur_num[j]} != 0 ]; then + tmp_index=${cur_index[j]} + if [ ! 
-z "$start_num" ]; then + tmp_index=($tmp_index - $start_num) + fi + printf "******real index:%10d, index: %10d, level: %2d, num::%8d, latency:%10d ns, total latency:%10d ns, func:%s, origin:%s\n" ${cur_real_index[j]} $tmp_index ${cur_level_stat[j]} ${cur_num[j]} ${cur_latency[j]} ${total_latency[j]} ${cur_func[j]} ${cur_pointer_func[j]} + fi + done + fi + + continue + done +} + +#main +function main() +{ + #check paremter counts + if [ $# -lt 2 ]; then + echo "[Error]: t-ops os_stat linux_dir/"which afford source code" function/"which scan from this function" flags/"0:hot path, 1: one function" hot/"if first scan hot path, hot=1, other case hot=0" \ + such as: t-ops os_stat /data/tkernel4 vfs_read 0 0" + return + fi + + #check user tools + if [ ! -f "./os_stat_blongm" ]; then + echo "user stat file: os_stat_blongm is not exist" + fi + + #scan function list + check_cflow + + #测试前,先卸载驱动,然后安装,防止程序强制退出时,ftrace部分没有卸载hook + rmmod os_aware + if [ $# -eq 0 ]; then + echo "need install os stat module: ./stat.sh os_aware_xxx.ko" + return + fi + insmod $1 + echo "install os stat kernel module:" $1 "successfully" + + #scan function list + cflow_func $2 $3 $4 $6 + + #start to work + do_work $2 $5 $6 $7 $8 $9 $3 +} + +main $* + + diff --git a/ops/os_stat/os_stat_user/test.c b/ops/os_stat/os_stat_user/test.c new file mode 100644 index 0000000000000000000000000000000000000000..79d2e98eea926cdde64c9213ca31bfd8eb47b9f9 --- /dev/null +++ b/ops/os_stat/os_stat_user/test.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "main.h" + +/* + * work with main.c:main, main.c initializes module and context, is main line + * data_only.c:main: here gets data only, which not to initialize context + * and data_only.c:main could work with main.c:main + */ +int main(int argc,char *argv[]) +{ + int file_fd, file_fd_total, fd, ret; + struct func_latency data; + + file_fd = open("/data/1.txt", + O_RDWR | O_CREAT, S_IRWXU | S_IRGRP | S_IROTH); + if (fd < 0) { + printf("open /usr/lib//tencentos-tools/1.txt error:%d\n", + fd); + return -EINVAL; + } + + sleep(2); + printf("read data start\n"); + /* get data from kernel */ + ret = read(fd, &data, sizeof(struct func_latency)); + printf("read data end\n"); + + close(file_fd); + return 0; +} diff --git a/ops/show/ops-help b/ops/show/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..6983992f76873488fff569831c1f4caaec6a8900 --- /dev/null +++ b/ops/show/ops-help @@ -0,0 +1 @@ +show: show all sub command diff --git a/ops/show/ops-run b/ops/show/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..251392a33583c147938ff7ac45530c7de5755c27 --- /dev/null +++ b/ops/show/ops-run @@ -0,0 +1,23 @@ +#!/bin/bash +# -*- coding: utf-8 -*- +############################################### +# Version : V1.0 +# Auther : aurelianliu@tencent.com +# Organization : NULL +############################################### + +function strstr() +{ + echo $1 | grep $2 +} + +function main() +{ + #work + printf "\033[31m-------------find work: show all sub command\033[0m\n" + subdir=$(dirname $0) + dir=${subdir%ops*} + #------ show all commnad ------ + $dir/oc-ops show_comm +} +main $* diff --git a/ops/system/checkkernel/check_release_kernel.sh b/ops/system/checkkernel/check_release_kernel.sh new file mode 100755 index 0000000000000000000000000000000000000000..aa0c993802ed78a70d278370a39aab6e9af41e80 --- /dev/null +++ b/ops/system/checkkernel/check_release_kernel.sh @@ -0,0 +1,208 @@ +#!/bin/bash +# +# 
功能:检查某个内核版本号是否是TencentOS Server正式发布的内核版本 +# 作者:Yongliang Gao + +WORKDIR=$(dirname `readlink -f $0`) + +WHILELIST_FILE=$WORKDIR/release_whitelist.txt +BLACKLIST_FILE=$WORKDIR/release_blacklist.txt + +VERSION="1.0" +DEBUG_MODE=0 +KERNEL_VERSION=$(uname -r) +TMANAGER_URL="https://tmanager.woa.com/#/subsystem/software/kernel" + +RED="\033[0;31m" +GREEN="\033[0;32m" +YELLOW="\033[0;33m" +BLUE="\033[0;34m" +NO_COLOR="\033[0m" + +function pr_err() { + printf "${RED}$1${NO_COLOR}\n" +} + +function pr_info() { + printf "${GREEN}$1${NO_COLOR}\n" +} + +function pr_warn() { + printf "${YELLOW}$1${NO_COLOR}\n" +} + +function pr_debug() { + if [ "$DEBUG_MODE" -eq 1 ]; then + printf "${BLUE}$1${NO_COLOR}\n" + fi +} + +function show_version() { + echo "Version: ${VERSION}" +} + +function show_help() { + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " -k KERNEL_VERSION Specify the kernel version to check." + echo " If not provided, use the current kernel version." + echo " -d Enable debug mode." + echo " -h Show this help message and exit." + echo " -v Show version information and exit." +} + +function check_unrelease() { + local version=$1 + + if [[ $version =~ (rc|prerelease|git|kasan|virt|test) ]]; then + pr_debug "非正式内核版本:包含非正式内核的关键字" + return 1 + fi + + if [[ ! $version =~ ^(3\.10|4\.14|5\.4|6\.6) ]]; then + pr_debug "非正式内核版本:不是3.10或4.14或5.4或6.6开头的版本" + return 1 + fi + + if [[ $version =~ ^3\.10 ]] && [[ ! $version =~ tlinux2 ]]; then + pr_debug "非正式内核版本:3.10开头,但不包含tlinux2字段" + return 1 + fi + + if [[ $version =~ ^4\.14 ]] && [[ ! $version =~ tlinux3 ]]; then + pr_debug "非正式内核版本:4.14开头,但不包含tlinux3字段" + return 1 + fi + + if [[ $version =~ ^5\.4 ]]; then + if [[ $version =~ plus$ ]]; then + if [[ ! $version =~ -19- ]]; then + pr_debug "非正式内核版本:5.4开头且plus结尾,不包含-19-" + return 1 + fi + elif [[ ! $version =~ tlinux4 ]]; then + pr_debug "非正式内核版本:5.4开头但不是plus结尾的,不包含tlinux4字段" + return 1 + fi + fi + + return 0 +} + +function check_release_whitelist() { + local version=$1 + + while IFS= read -r line + do + if [[ -z $line ]] || [[ $line =~ ^# ]]; then + continue + fi + + if [[ $version == $line ]]; then + pr_debug "正式内核版本,白名单匹配" + return 1 + fi + done < $WHILELIST_FILE + + return 0 +} + +function check_release_blacklist() { + local version=$1 + + while IFS= read -r line + do + if [[ -z $line ]] || [[ $line =~ ^# ]]; then + continue + fi + + if [ "$version" = "$line" ]; then + pr_debug "非正式内核版本,黑名单匹配" + return 1 + fi + done < $BLACKLIST_FILE + + return 0 +} + +function check_tmanager() { + local version=$1 + + pr_warn "本地无法判断${version}是否是TencentOS Server正式发布的内核版本" + pr_warn "请在Tmanager平台上搜索:${TMANAGER_URL}" + return 0 +} + +function parse_args() { + while getopts ":k:dhv" opt; do + case $opt in + k) + KERNEL_VERSION=$OPTARG + ;; + d) + DEBUG_MODE=1 + ;; + h) + show_help + exit 0 + ;; + v) + show_version + exit 0 + ;; + \?) + pr_err "Invalid option: -$OPTARG" + show_help + exit 1 + ;; + :) + pr_err "Option -$OPTARG requires an argument." + show_help + exit 1 + ;; + esac + done +} + +function main() { + parse_args $@ + + if [ ! -f "$WHILELIST_FILE" ]; then + pr_err "内核版本白名单文件$WHILELIST_FILE不存在" + return 1 + fi + + if [ ! -f "$BLACKLIST_FILE" ]; then + pr_err "内核版本黑名单文件$BLACKLIST_FILE不存在" + return 1 + fi + + # 检查明确不是正式内核版本的情况 + check_unrelease $KERNEL_VERSION + if [ $? -ne 0 ]; then + pr_warn "${KERNEL_VERSION}不是TencentOS Server正式发布的内核版本" + return 1 + fi + + # 检查正式内核版本白名单 + check_release_whitelist $KERNEL_VERSION + if [ $? 
-ne 0 ]; then + pr_info "${KERNEL_VERSION}是TencentOS Server正式发布的内核版本" + return 0 + fi + + # 检查正式内核版本黑名单 + check_release_blacklist $KERNEL_VERSION + if [ $? -ne 0 ]; then + pr_warn "${KERNEL_VERSION}不是TencentOS Server正式发布的内核版本" + return 0 + fi + + # 使用tmanager查询 + check_tmanager $KERNEL_VERSION + return 0 +} + +main $@ +exit $? \ No newline at end of file diff --git a/ops/system/checkkernel/ops-help b/ops/system/checkkernel/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..030621d2c13baf47a04b765aad864f6822c055be --- /dev/null +++ b/ops/system/checkkernel/ops-help @@ -0,0 +1 @@ +Check release kernel diff --git a/ops/system/checkkernel/ops-run b/ops/system/checkkernel/ops-run new file mode 120000 index 0000000000000000000000000000000000000000..345aa159836db96176f2ff1e8e1e9f709c2a87f5 --- /dev/null +++ b/ops/system/checkkernel/ops-run @@ -0,0 +1 @@ +check_release_kernel.sh \ No newline at end of file diff --git a/ops/system/checkkernel/release_blacklist.txt b/ops/system/checkkernel/release_blacklist.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c1cbde61ff26ff714f046461c05b6e111cd6ae8 --- /dev/null +++ b/ops/system/checkkernel/release_blacklist.txt @@ -0,0 +1,10 @@ +# +# 内核版本黑名单文件 +# +# =============== blacklist start ============== +5.4.119-1-tlinux4-0009-eks +5.4.119-1-tlinux4-0009-public-eks +5.4.119-19-tlinux4-0010 +5.4.119-1-tlinux4-2ab3fe8a1334-timens +5.4.119-1-tlinux4_toa_TCSOS2.0-nosign +# =============== blacklist end ============== diff --git a/ops/system/checkkernel/release_whitelist.txt b/ops/system/checkkernel/release_whitelist.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f77632056594aadebb4c6744af4641ad62a0e9a --- /dev/null +++ b/ops/system/checkkernel/release_whitelist.txt @@ -0,0 +1,203 @@ +# +# 内核版本白名单文件 +# + +# =============== TK2 start ============== +# TK2内核 +3.10.90-1-tlinux2-0031.tl1 +3.10.94-1-tlinux2-0035.tl1 +3.10.94-1-tlinux2-0036.tl1 +3.10.101-1-tlinux2-0037.tl1 +3.10.101-1-tlinux2-0039.tl1 +3.10.102-1-tlinux2-0040.tl1 +3.10.104-1-tlinux2-0041.tl1 +3.10.105-1-tlinux2-0042.tl1 + +3.10.90-1-tlinux2-0031.tl2 +3.10.94-1-tlinux2-0035.tl2 +3.10.94-1-tlinux2-0036.tl2 +3.10.101-1-tlinux2-0037.tl2 +3.10.101-1-tlinux2-0039.tl2 +3.10.102-1-tlinux2-0040.tl2 +3.10.104-1-tlinux2-0041.tl2 +3.10.105-1-tlinux2-0042.tl2 + +3.10.105-1-tlinux2-0042 +3.10.106-1-tlinux2-0044 +3.10.107-1-tlinux2-0045 +3.10.107-1-tlinux2-0046 +3.10.107-1-tlinux2-0047 +3.10.107-1-tlinux2-0048 +3.10.107-1-tlinux2-0049 +3.10.107-1-tlinux2-0050 +3.10.107-1-tlinux2-0051 +3.10.107-1-tlinux2-0052 +3.10.107-1-tlinux2-0053 +3.10.107-1-tlinux2-0054 +3.10.107-1-tlinux2-0055 +3.10.107-1-tlinux2-0056 + +# TK2 kvm_guest内核 +3.10.83-1-tlinux2_kvm_guest-0015.tl1 +3.10.94-1-tlinux2_kvm_guest-0019.tl1 +3.10.94-1-tlinux2_kvm_guest-0019.tl2 +3.10.104-1-tlinux2_kvm_guest-0021.tl1 +3.10.104-1-tlinux2_kvm_guest-0021.tl2 +3.10.104-1-tlinux2_kvm_guest-0022.tl2 +3.10.106-1-tlinux2_kvm_guest-0024 + +3.10.107-1-tlinux2_kvm_guest-0045 +3.10.107-1-tlinux2_kvm_guest-0046 +3.10.107-1-tlinux2_kvm_guest-0047 +3.10.107-1-tlinux2_kvm_guest-0048 +3.10.107-1-tlinux2_kvm_guest-0049 +3.10.107-1-tlinux2_kvm_guest-0050 +3.10.107-1-tlinux2_kvm_guest-0051 +3.10.107-1-tlinux2_kvm_guest-0052 +3.10.107-1-tlinux2_kvm_guest-0053 +3.10.107-1-tlinux2_kvm_guest-0054 +3.10.107-1-tlinux2_kvm_guest-0055 +3.10.107-1-tlinux2_kvm_guest-0056 +# =============== TK2 end ================ + +# =============== TK3 start ============== +# TK3内核 +4.14.105-1-tlinux3-0007 
+4.14.105-1-tlinux3-0008 +4.14.105-1-tlinux3-0009 +4.14.105-1-tlinux3-0010 +4.14.105-1-tlinux3-0011 +4.14.105-1-tlinux3-0012 +4.14.105-1-tlinux3-0013 +4.14.105-1-tlinux3-0014 +4.14.105-1-tlinux3-0015 +4.14.105-1-tlinux3-0016 +4.14.105-1-tlinux3-0017 +4.14.105-1-tlinux3-0018 +4.14.105-1-tlinux3-0019 +4.14.105-1-tlinux3-0020 +4.14.105-1-tlinux3-0020.1 +4.14.105-1-tlinux3-0020.2 +4.14.105-1-tlinux3-0020.3 +4.14.105-1-tlinux3-0021 +4.14.105-1-tlinux3-0022 +4.14.105-1-tlinux3-0023 +4.14.105-1-tlinux3-0023.1 +# =============== TK3 end ================ + +# =============== TK4 start ============== +# TK4 长期演进版本内核 +5.4.32-1-tlinux4-0001 +5.4.87-1-tlinux4-0002 +5.4.109-1-tlinux4-0003 +5.4.119-1-tlinux4-0004 +5.4.119-1-tlinux4-0005 +5.4.119-1-tlinux4-0006 +5.4.119-1-tlinux4-0007 +5.4.119-1-tlinux4-0008 +5.4.119-1-tlinux4-0009 +5.4.119-1-tlinux4-0009.1 +5.4.119-1-tlinux4-0009.2 +5.4.119-1-tlinux4-0009.3 +5.4.119-1-tlinux4-0010 +5.4.119-1-tlinux4-0010.1 +5.4.119-1-tlinux4-0010.2 +5.4.119-1-tlinux4-0010.3 +5.4.203-1-tlinux4-0011 +5.4.203-1-tlinux4-0011.1 +5.4.203-1-tlinux4-0011.2 +5.4.203-1-tlinux4-0011.3 +5.4.241-1-tlinux4-0017 +5.4.241-1-tlinux4-0017.1 +5.4.241-1-tlinux4-0017.2 +5.4.241-1-tlinux4-0017.3 +5.4.241-1-tlinux4-0017.4 +5.4.241-1-tlinux4-0017.5 +5.4.241-1-tlinux4-0017.6 +5.4.241-1-tlinux4-0017.7 +5.4.241-1-tlinux4-0017.8 +5.4.241-1-tlinux4-0017.9 +5.4.241-1-tlinux4-0017.10 +5.4.241-1-tlinux4-0017.11 +5.4.241-1-tlinux4-0017.12 +5.4.241-1-tlinux4-0017.13 +5.4.241-1-tlinux4-0017.14 +5.4.241-1-tlinux4-0017.15 +5.4.241-1-tlinux4-0017.16 +5.4.241-1-tlinux4-0022 + +# TK4 新特性版本内核 +5.4.241-1-tlinux4-0018 +5.4.241-1-tlinux4-0019 +5.4.241-1-tlinux4-0019.feat + +# TK4 EKS版本内核 +5.4.241-1-tlinux4-0017.6-eks.03 +5.4.241-1-tlinux4-0017.10.eks +5.4.241-1-tlinux4-0017.10.eks.1 +5.4.241-1-tlinux4-0017.10.eks.2 +5.4.241-1-tlinux4-0017.10.eks.3 +5.4.241-1-tlinux4-0017.10.eks.4 +5.4.241-1-tlinux4-0017.10.eks.5 +5.4.241-1-tlinux4-0017.10.eks.6 +5.4.241-1-tlinux4-0017.10.eks.7 +5.4.241-1-tlinux4-0017.10.eks.8 +5.4.241-1-tlinux4-0017.10.eks.9 +5.4.241-1-tlinux4-0017.10.eks.10 + +# TK4 public plus内核 +5.4.119-19-0007_plus +5.4.119-19-0008_plus +5.4.119-19-0009_plus +5.4.119-19-0010_plus +5.4.119-19-0011_plus +5.4.119-19-0012_plus +5.4.119-19-0013_plus +5.4.119-19-0015_plus +5.4.203-19-0016_plus +5.4.203-19-0016.2_plus +5.4.203-19-0016.3_plus +5.4.241-19-0017_plus +5.4.241-19-0017.1_plus +5.4.241-19-0017.2_plus +5.4.241-19-0017.3_plus +5.4.241-19-0017.5_plus +5.4.241-19-0017.6_plus +5.4.241-19-0017.7_plus + +# TK4 emr内核 +5.4.203-1-tlinux4-0011.emr.0001 +5.4.203-1-tlinux4-0011.emr.0002 +5.4.203-1-tlinux4-0011.emr.0003 +5.4.203-1-tlinux4-0011.emr.0003.1 +5.4.203-1-tlinux4-0011.emr.0003.2 +5.4.203-1-tlinux4-0011.emr.0003.3 + +# TK4 spr内核 +5.4.203-1-tlinux4-0011.spr.0001 +5.4.203-1-tlinux4-0011.spr.0001.1 +5.4.203-1-tlinux4-0011.spr.0001.2 +5.4.203-1-tlinux4-0011.spr.0001.3 +5.4.203-1-tlinux4-0011.spr.0002 +5.4.203-1-tlinux4-0011.spr.0003 +5.4.203-1-tlinux4-0011.spr.0003.1 + +# =============== TK4 end ================ + +# =============== TK5 start ============== +# TK5最新内核 +6.6.26-1.tl4.x86_64 +6.6.30-5.tl4.x86_64 +6.6.34-9.tl4.x86_64 +6.6.47-12.tl4.x86_64 +6.6.58-15.tl4.x86_64 +6.6.64-18.tl4.x86_64 +6.6.70-24.tl4.x86_64 + +# TK5其他内核 +6.6.30-4.tl4.x86_64 +6.6.26-1.0.tl4.x86_64 +6.6.6-2401.0.1.tl4.x86_64 +6.6.6-2401.0.1.tl4.3.x86_64 +6.6.6-2401.0.1.tl4.4.x86_64 +# =============== TK5 end ================ diff --git a/ops/system/install-kernel-devel-pkg/install-kernel-devel.sh 
b/ops/system/install-kernel-devel-pkg/install-kernel-devel.sh new file mode 100755 index 0000000000000000000000000000000000000000..bb6cf716e48b7eb924e5a83485098aa474e13c7c --- /dev/null +++ b/ops/system/install-kernel-devel-pkg/install-kernel-devel.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# 获取当前架构(例如 x86_64, aarch64) +arch=$(uname -m) + +# 获取内核版本并移除架构后缀 +kernel_version=$(uname -r | sed "s/\.${arch}$//") + +# 从yum源配置获取系统小版本号 +releasever=$(cat /etc/yum/vars/releasever | sed "s/\.[0-9].*//") + +if [[ $kernel_version =~ ^(.*)-tlinux([0-9]+)-(.*)$ ]]; then + base_version="${BASH_REMATCH[1]}" + tlinux_num="${BASH_REMATCH[2]}" + build_number="${BASH_REMATCH[3]}" + pkg_name="kernel-tlinux${tlinux_num}-devel-${base_version}.${build_number}.tl${releasever}.${arch}" + +elif [[ $kernel_version =~ \.tl[0-9]+$ ]]; then + pkg_name="kernel-devel-${kernel_version}.${arch}" + +else + pkg_name="kernel-devel-${kernel_version}.tl${releasever}.${arch}" +fi + +if rpm -qa | grep -q -w $pkg_name; then + echo "$pkg_name is already existed." + exit 0 +fi + +pkg_url="" +pkg_url=$(repoquery --enablerepo=* --location $pkg_name | grep -w "$pkg_name") +if [ -z $pkg_url ]; then + echo "$pkg_name cannot be found in yum repository." + exit 1 +fi + +if ! rpm -ivh --force $pkg_url; then + echo "Install $pkg_name from $pkg_url failed." + exit 1 +fi + +if rpm -qa | grep -q -w $pkg_name; then + echo "Install $pkg_name successed." + exit 0 +else + echo "Install $pkg_name failed." + exit 1 +fi + diff --git a/ops/system/install-kernel-devel-pkg/ops-help b/ops/system/install-kernel-devel-pkg/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..9eedb06b884679920b0417963523dc2168f4343c --- /dev/null +++ b/ops/system/install-kernel-devel-pkg/ops-help @@ -0,0 +1 @@ +Install the kernel-devel RPM package from yum repository. diff --git a/ops/system/install-kernel-devel-pkg/ops-run b/ops/system/install-kernel-devel-pkg/ops-run new file mode 120000 index 0000000000000000000000000000000000000000..203fbf5ef7b885fddd0458c363a19df25452c415 --- /dev/null +++ b/ops/system/install-kernel-devel-pkg/ops-run @@ -0,0 +1 @@ +install-kernel-devel.sh \ No newline at end of file diff --git a/ops/system/kdump/kdump.conf b/ops/system/kdump/kdump.conf new file mode 100644 index 0000000000000000000000000000000000000000..55604515aacf0ad858b404c0a576c5ab85596a5a --- /dev/null +++ b/ops/system/kdump/kdump.conf @@ -0,0 +1,185 @@ +# This file contains a series of commands to perform (in order) in the kdump +# kernel after a kernel crash in the crash kernel(1st kernel) has happened. +# +# Directives in this file are only applicable to the kdump initramfs, and have +# no effect once the root filesystem is mounted and the normal init scripts are +# processed. +# +# Currently, only one dump target and path can be specified. If the dumping to +# the configured target fails, the failure action which can be configured via +# the "failure_action" directive will be performed. +# +# Supported options: +# +# raw +# - Will dd /proc/vmcore into . +# Use persistent device names for partition devices, +# such as /dev/vg/. +# +# nfs +# - Will mount nfs to , and copy /proc/vmcore to +# //%HOST-%DATE/, supports DNS. +# +# ssh +# - Will save /proc/vmcore to :/%HOST-%DATE/, +# supports DNS. +# NOTE: make sure the user has write permissions on the server. +# +# sshkey +# - Will use the sshkey to do ssh dump. +# Specify the path of the ssh key to use when dumping +# via ssh. The default value is /root/.ssh/kdump_id_rsa. 
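+# +# (Example added for illustration, not in the stock file: an assumed ssh dump +# setup, reusing the default raw/ssh core_collector documented further below, +# could look like this; host and user are placeholders.) +# ssh kdump@192.168.1.10 +# sshkey /root/.ssh/kdump_id_rsa +# core_collector makedumpfile -F -l --message-level 7 -d 31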
+# +# +# - Will mount -t , and copy +# /proc/vmcore to //%HOST_IP-%DATE/. +# NOTE: can be a device node, label or uuid. +# It's recommended to use persistent device names +# such as /dev/vg/. +# Otherwise it's suggested to use label or uuid. +# +# path +# - "path" represents the file system path in which vmcore +# will be saved. If a dump target is specified in +# kdump.conf, then "path" is relative to the specified +# dump target. +# +# Interpretation of "path" changes a bit if the user didn't +# specify any dump target explicitly in kdump.conf. In this +# case, "path" represents the absolute path from root. The +# dump target and adjusted path are arrived at automatically +# depending on what's mounted in the current system. +# +# Ignored for raw device dumps. If unset, will use the default +# "/var/crash". +# +# core_collector +# - This allows you to specify the command to copy +# the vmcore. The default is makedumpfile, which on +# some architectures can drastically reduce vmcore size. +# See /sbin/makedumpfile --help for a list of options. +# Note that the -i and -g options are not needed here, +# as the initrd will automatically be populated with a +# config file appropriate for the running kernel. +# The default core_collector for raw/ssh dump is: +# "makedumpfile -F -l --message-level 7 -d 31". +# The default core_collector for other targets is: +# "makedumpfile -l --message-level 7 -d 31". +# +# "makedumpfile -F" will create a flattened vmcore. +# You need to use "makedumpfile -R" to rearrange the dump data to +# a normal dumpfile readable with analysis tools. For example: +# "makedumpfile -R vmcore < vmcore.flat". +# +# For core_collector format details, you can refer to +# kexec-kdump-howto.txt or kdump.conf manpage. +# +# kdump_post +# - This directive allows you to run a executable binary +# or script after the vmcore dump process terminates. +# The exit status of the current dump process is fed to +# the executable binary or script as its first argument. +# All files under /etc/kdump/post.d are collectively sorted +# and executed in lexical order, before binary or script +# specified kdump_post parameter is executed. +# +# kdump_pre +# - Works like the "kdump_post" directive, but instead of running +# after the dump process, runs immediately before it. +# Exit status of this binary is interpreted as follows: +# 0 - continue with dump process as usual +# non 0 - run the final action (reboot/poweroff/halt) +# All files under /etc/kdump/pre.d are collectively sorted and +# executed in lexical order, after binary or script specified +# kdump_pre parameter is executed. +# Even if the binary or script in /etc/kdump/pre.d directory +# returns non 0 exit status, the processing is continued. +# +# extra_bins +# - This directive allows you to specify additional binaries or +# shell scripts to be included in the kdump initrd. +# Generally they are useful in conjunction with a kdump_post +# or kdump_pre binary or script which depends on these extra_bins. +# +# extra_modules +# - This directive allows you to specify extra kernel modules +# that you want to be loaded in the kdump initrd. +# Multiple modules can be listed, separated by spaces, and any +# dependent modules will automatically be included. +# +# failure_action +# - Action to perform in case dumping fails. +# reboot: Reboot the system. +# halt: Halt the system. +# poweroff: Power down the system. +# shell: Drop to a bash shell. +# Exiting the shell reboots the system by default, +# or perform "final_action". 
+# dump_to_rootfs: Dump vmcore to rootfs from initramfs context and +# reboot by default or perform "final_action". +# Useful when non-root dump target is specified. +# The default option is "reboot". +# +# default +# - Same as the "failure_action" directive above, but this directive +# is obsolete and will be removed in the future. +# +# final_action +# - Action to perform in case dumping succeeds. Also performed +# when "shell" or "dump_to_rootfs" failure action finishes. +# Each action is same as the "failure_action" directive above. +# The default is "reboot". +# +# force_rebuild <0 | 1> +# - By default, kdump initrd will only be rebuilt when necessary. +# Specify 1 to force rebuilding kdump initrd every time when kdump +# service starts. +# +# force_no_rebuild <0 | 1> +# - By default, kdump initrd will be rebuilt when necessary. +# Specify 1 to bypass rebuilding of kdump initrd. +# +# force_no_rebuild and force_rebuild options are mutually +# exclusive and they should not be set to 1 simultaneously. +# +# override_resettable <0 | 1> +# - Usually an unresettable block device can't be a dump target. +# Specifying 1 when you want to dump even though the block +# target is unresettable +# By default, it is 0, which will not try dumping destined to fail. +# +# dracut_args +# - Pass extra dracut options when rebuilding kdump initrd. +# +# fence_kdump_args +# - Command line arguments for fence_kdump_send (it can contain +# all valid arguments except hosts to send notification to). +# +# fence_kdump_nodes +# - List of cluster node(s) except localhost, separated by spaces, +# to send fence_kdump notifications to. +# (this option is mandatory to enable fence_kdump). +# + +#raw /dev/vg/lv_kdump +#ext4 /dev/vg/lv_kdump +#ext4 LABEL=/boot +#ext4 UUID=03138356-5e61-4ab3-b58e-27507ac41937 +#nfs my.server.com:/export/tmp +#nfs [2001:db8::1:2:3:4]:/export/tmp +#ssh user@my.server.com +#ssh user@2001:db8::1:2:3:4 +#sshkey /root/.ssh/kdump_id_rsa +path /data/tlinux/crash +core_collector makedumpfile -l --message-level 7 -d 31 +#core_collector scp +#kdump_post /var/crash/scripts/kdump-post.sh +#kdump_pre /var/crash/scripts/kdump-pre.sh +#extra_bins /usr/bin/lftp +#extra_modules gfs2 +#failure_action shell +#force_rebuild 1 +#force_no_rebuild 1 +#dracut_args --omit-drivers "cfg80211 snd" --add-drivers "ext2 ext3" +#fence_kdump_args -p 7410 -f auto -c 0 -i 10 +#fence_kdump_nodes node1 node2 diff --git a/ops/system/kdump/ops-help b/ops/system/kdump/ops-help new file mode 100644 index 0000000000000000000000000000000000000000..c9485397f57dbcb763177b86065235eddc2d7a8d --- /dev/null +++ b/ops/system/kdump/ops-help @@ -0,0 +1 @@ +Check kdump service status diff --git a/ops/system/kdump/ops-run b/ops/system/kdump/ops-run new file mode 100755 index 0000000000000000000000000000000000000000..132bf0b11964ff7138d5cb3352b1bd8cb34fe7c2 --- /dev/null +++ b/ops/system/kdump/ops-run @@ -0,0 +1,182 @@ +#!/bin/bash + +KDUMP_CONFIG_FILE="/etc/kdump.conf" + +# Read from kdump config file stripping all comments +read_strip_comments() +{ + # strip heading spaces, and print any content starting with + # neither space or #, and strip everything after # + sed -n -e "s/^\s*\([^# \t][^#]\+\).*/\1/gp" $1 +} + +strip_comments() +{ + echo $@ | sed -e 's/\(.*\)#.*/\1/' +} + +# get_option_value +# retrieves value of option defined in kdump.conf +get_option_value() { + strip_comments `grep "^$1[[:space:]]\+" /etc/kdump.conf | tail -1 | cut -d\ -f2-` +} + +check_failure_action_config() +{ + local default_option + local failure_action + 
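A practical footnote to the core_collector comments above: when the flattened format ("makedumpfile -F", the default for raw and ssh targets) is used, the captured file has to be rearranged before crash-analysis tools can open it. A small sketch; the directory and file names are placeholders:

```bash
#!/bin/bash
# Rearrange a flattened vmcore (produced by "makedumpfile -F") into a normal
# dumpfile readable by crash(8). Paths are illustrative only.
dumpdir=/var/crash/192.0.2.10-2024-01-01-00:00:00
makedumpfile -R "$dumpdir/vmcore" < "$dumpdir/vmcore.flat" \
    && echo "rearranged dump written to $dumpdir/vmcore"
```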
local option="failure_action" + + default_option=$(awk '$1 ~ /^default$/ {print $2;}' $KDUMP_CONFIG_FILE) + failure_action=$(awk '$1 ~ /^failure_action$/ {print $2;}' $KDUMP_CONFIG_FILE) + + if [ -z "$failure_action" -a -z "$default_option" ]; then + return 0 + elif [ -n "$failure_action" -a -n "$default_option" ]; then + echo "Cannot specify 'failure_action' and 'default' option together" + return 1 + fi + + if [ -n "$default_option" ]; then + option="default" + failure_action="$default_option" + fi + + case "$failure_action" in + reboot|halt|poweroff|shell|dump_to_rootfs) + return 0 + ;; + *) + echo $"Usage kdump.conf: $option {reboot|halt|poweroff|shell|dump_to_rootfs}" + return 1 + esac +} + +check_final_action_config() +{ + local final_action + + final_action=$(awk '$1 ~ /^final_action$/ {print $2;}' $KDUMP_CONFIG_FILE) + if [ -z "$final_action" ]; then + return 0 + else + case "$final_action" in + reboot|halt|poweroff) + return 0 + ;; + *) + echo $"Usage kdump.conf: final_action {reboot|halt|poweroff}" + return 1 + esac + fi +} + +check_fence_kdump_config() +{ + local hostname=`hostname` + local ipaddrs=`hostname -I` + local nodes=$(get_option_value "fence_kdump_nodes") + + for node in $nodes; do + if [ "$node" = "$hostname" ]; then + echo "Option fence_kdump_nodes cannot contain $hostname" + return 1 + fi + # node can be ipaddr + echo $ipaddrs | grep $node > /dev/null + if [ $? -eq 0 ]; then + echo "Option fence_kdump_nodes cannot contain $node" + return 1 + fi + done + + return 0 +} + +check_kdump_config() +{ + local nr + + nr=$(awk 'BEGIN{cnt=0} /^raw|^ssh[[:blank:]]|^nfs|^ext[234]|^xfs|^btrfs|^minix|^dracut_args .*\-\-mount/{cnt++} END{print cnt}' $KDUMP_CONFIG_FILE) + [ $nr -gt 1 ] && { + echo "More than one dump targets specified." + return 1 + } + + nr=$(grep "^dracut_args .*\-\-mount" $KDUMP_CONFIG_FILE | grep -o "\-\-mount" | wc -l) + [ $nr -gt 1 ] && { + echo "Multiple mount targets specified in one \"dracut_args\"." + return 1 + } + + # Check if we have any leading spaces (or tabs) before the + # variable name in the kdump conf file + if grep -E -q '^[[:blank:]]+[a-z]' $KDUMP_CONFIG_FILE; then + echo "No whitespaces are allowed before a kdump option name in $KDUMP_CONFIG_FILE" + return 1 + fi + + while read config_opt config_val; do + case "$config_opt" in + \#* | "") + ;; + raw|ext2|ext3|ext4|minix|btrfs|xfs|nfs|ssh|sshkey|path|core_collector|kdump_post|kdump_pre|extra_bins|extra_modules|default|force_rebuild|force_no_rebuild|dracut_args|fence_kdump_args|fence_kdump_nodes) + [ -z "$config_val" ] && { + echo "Invalid kdump config value for option $config_opt." + return 1; + } + if [ -d "/proc/device-tree/ibm,opal/dump" ] && [ "$config_opt" == "raw" ]; then + echo "WARNING: Won't capture opalcore when 'raw' dump target is used." + fi + ;; + net|options|link_delay|disk_timeout|debug_mem_level|blacklist) + echo "Deprecated kdump config option: $config_opt. Refer to kdump.conf manpage for alternatives." + return 1 + ;; + *) + echo "Invalid kdump config option $config_opt" + return 1; + ;; + esac + done <<< "$(read_strip_comments $KDUMP_CONFIG_FILE)" + + check_failure_action_config || return 1 + check_final_action_config || return 1 + check_fence_kdump_config || return 1 + + return 0 +} + + +check_kdump_status(){ + systemctl status kdump >/dev/null + return $? +} + +ops_run(){ + check_kdump_status + if [ $? -ne 0 ]; then + check_kdump_config + if [ $? 
-ne 0 ]; then
+			echo "/etc/kdump.conf check failed; /etc/kdump.conf will be overwritten with the default configuration, and the old configuration is archived in /etc/kdump.conf.old"
+			cp /etc/kdump.conf /etc/kdump.conf.old
+			cp -f /usr/lib/tencentos-tools/ops/kdump/kdump.conf /etc/kdump.conf
+			systemctl restart kdump >/dev/null
+			if [ $? -ne 0 ]; then
+				echo 'The kdump service is still abnormal after restart even though the configuration check passed; there may be other reasons for the failure. See "systemctl status kdump.service" and "journalctl -xe" for details.'
+				return 1
+			fi
+			echo "The kdump service is normal with the default configuration."
+			return 0
+		fi
+		systemctl restart kdump >/dev/null
+		if [ $? -ne 0 ]; then
+			echo 'The kdump service is abnormal although the configuration check passed; there may be other reasons for the failure. See "systemctl status kdump.service" and "journalctl -xe" for details.'
+			return 1
+		fi
+	fi
+	echo "The kdump service is normal."
+	return 0
+}
+
+ops_run
diff --git a/ops/system/ops-help b/ops/system/ops-help
new file mode 100644
index 0000000000000000000000000000000000000000..790c94d5c42d22383e6be61123ecf88884b05f13
--- /dev/null
+++ b/ops/system/ops-help
@@ -0,0 +1 @@
+Misc tools
diff --git a/ops/system/tracesig/lib.sh b/ops/system/tracesig/lib.sh
new file mode 100755
index 0000000000000000000000000000000000000000..12dbfccf2218bcdcc853c3f18d0e24949195ffbf
--- /dev/null
+++ b/ops/system/tracesig/lib.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set_cgroup_rights()
+{
+	if [ -e /sys/fs/cgroup/cgroup.procs ]; then
+		## cgroup v2
+		echo "$PPID" > /sys/fs/cgroup/cgroup.procs
+		echo "$$" > /sys/fs/cgroup/cgroup.procs
+	else
+		if [ -e /sys/fs/cgroup/cpuset/cgroup.procs ]; then
+			## cgroup v1
+			echo "$PPID" > /sys/fs/cgroup/cpuset/cgroup.procs 2>/dev/null
+			echo "$$" > /sys/fs/cgroup/cpuset/cgroup.procs 2>/dev/null
+		fi
+
+
+		if [ -e /sys/fs/cgroup/memory/cgroup.procs ]; then
+			## cgroup v1
+			echo "$PPID" > /sys/fs/cgroup/memory/cgroup.procs 2>/dev/null
+			echo "$$" > /sys/fs/cgroup/memory/cgroup.procs 2>/dev/null
+		fi
+	fi
+}
+
+export -f set_cgroup_rights
diff --git a/ops/system/tracesig/ops-help b/ops/system/tracesig/ops-help
new file mode 100644
index 0000000000000000000000000000000000000000..b43c147ba19b6e7038b2e16c8e8f3af8b45b1af1
--- /dev/null
+++ b/ops/system/tracesig/ops-help
@@ -0,0 +1 @@
+Trace signals
diff --git a/ops/system/tracesig/ops-run b/ops/system/tracesig/ops-run
new file mode 120000
index 0000000000000000000000000000000000000000..d8419e4b7e59f069632bc72dceb0a2bd8e83aba6
--- /dev/null
+++ b/ops/system/tracesig/ops-run
@@ -0,0 +1 @@
+tracesig.sh
\ No newline at end of file
diff --git a/ops/system/tracesig/tracesig.sh b/ops/system/tracesig/tracesig.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4e8d370ae12c3719533e8b4a937235a08e1fff87
--- /dev/null
+++ b/ops/system/tracesig/tracesig.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+usage="\
+Usage:
+  t-ops misc tracesig [-p pid| -c comm | -s sig] -e|-d|-r
+  COMMAND-LINE Options:
+  -p, pid of the signal-receiving process to trace
+  -s, trace only the given signal
+  -d, stop tracing
+  -r, show the recorded signal delivery information
+"
+
+if (( $# < 1 )); then
+	echo "$usage"
+	exit 1
+fi
+
+export logdir="/data/t-ops/misc/tracesig"
+
+export curr_dir=$(pwd)
+export work_dir=$(readlink /proc/$$/fd/255); work_dir=$(dirname $work_dir); cd "$work_dir"
+export run_log="$logdir/run_log"
+
+source lib.sh
+set_cgroup_rights
+
+tracee_pid=0
+tracee_sig=0
+
+check_logdir()
+{
+	echo "$logdir" | grep ^[/] > /dev/null ; local ret_val=$?
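For orientation, here is a typical session with the tracesig tool added above. The subcommand path and the pid/signal values are illustrative (the usage text says "misc" while the files live under ops/system), and enabling the trace boils down to a filtered perf record on the signal:signal_generate tracepoint:

```bash
# Trace SIGTERM (15) delivered to one process; values are examples.
# enable_trace runs automatically once the options are parsed.
oc-ops system tracesig -p 4321 -s 15

# Roughly what is started in the background:
#   perf record -e signal:signal_generate --filter 'pid == 4321 && sig == 15' &

# Later: print the recorded events via perf script (-r), then stop tracing (-d).
oc-ops system tracesig -r
oc-ops system tracesig -d
```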
+ if (( $ret_val != 0 )); then echo "Must using absolute path!"; return 1 ; fi + + echo "$logdir" | grep "misc/tracesig" > /dev/null ; ret_val=$? + if (( $ret_val != 0 )); then echo "Path must including \"misc/tracesig\" substring!"; return 1 ; fi + + echo "$logdir" | grep -E '[ | ]' > /dev/null ; ret_val=$? + if (( $ret_val == 0 )); then echo "Path must not including space and tab char!"; return 1 ; fi + + return 0 +} + +safe_rm() +{ + echo "$PWD/" | grep "$logdir" > /dev/null ; local ret_val=$? + if (( $ret_val != 0 )); then echo "Forbid rm outside the $logdir dir!"; return 1 ; fi + + echo "$@" | grep "/" > /dev/null ; ret_val=$? + if (( $ret_val == 0 )); then echo "Forbid having \"/\" in args!"; return 1 ; fi + + rm $@ 2>/dev/null +} +export -f safe_rm + +mkdir -p "$logdir" + +filter="--filter" +get_filter() +{ + if (( $tracee_pid != 0 )); then + if (( $tracee_sig != 0 )); then + filter=$( echo "$filter 'pid == $tracee_pid && sig == $tracee_sig'") + return + else + filter=$( echo "$filter 'pid == $tracee_pid'") + fi + fi + + if (( $tracee_sig != 0 )); then + filter=$( echo "$filter 'sig == $tracee_sig'") + fi +} + +perf_pid="" +enable_trace() +{ + check_logdir ; cd $logdir + safe_rm -f * + get_filter + perf_cmd="perf record -e signal:signal_generate $filter &" + eval $perf_cmd + perf_pid=$! + echo $perf_pid > perf_pid +} + +disable_trace() +{ + check_logdir ; cd $logdir + perf_pid=$(cat perf_pid) + kill -15 $perf_pid +} + +read_trace_result() +{ + check_logdir ; cd $logdir + perf script +} + +while getopts 'p:s:edrh' OPT; do + case $OPT in + p) tracee_pid="$OPTARG" + ;; + s) tracee_sig="$OPTARG" + ;; + d) disable_trace + exit 0 + ;; + r) read_trace_result + exit 0 + ;; + h) echo "$usage" + exit $? + ;; + ?) echo "Invalid option: $OPT" >&2 + echo "$usage" + exit 1 + ;; + esac +done + +enable_trace diff --git a/sys-manage/ocos-analyze-performance.sh b/sys-manage/ocos-analyze-performance.sh new file mode 100755 index 0000000000000000000000000000000000000000..29443eac0279ddc59e8b29ca7c12c1271ff7100a --- /dev/null +++ b/sys-manage/ocos-analyze-performance.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# ocos -a : analyze system performance +# joeytao@tencent.com + +ana_uptime() +{ + echo "========== Uptime ===============" + echo "# uptime" + uptime + echo "=================================" +} +ana_vmstat() +{ + echo "========== VMSTAT ===============" + echo "# vmstat 1 3" + vmstat 1 3 + echo "=================================" +} +ana_cpu() +{ + echo "========== CPU ==================" + echo "# mpstat 1 3" + mpstat 1 3 + echo "=================================" +} +ana_mem() +{ + echo "========== Memory ===============" + echo "# free -m" + free -m + echo "=================================" +} +ana_net() +{ + echo "========= Network ===============" + echo "# sar -n DEV 1 3" + sar -n DEV 1 3 + echo "=================================" +} +ana_io() +{ + echo "========= I/O ===================" + echo "# iostat -dx 1 3" + iostat -dx 1 3 + echo "=================================" +} +ana_pidstat() +{ + echo "========= pidstat ===================" + echo "# pidstat 1 3" + pidstat 1 3 + echo "=================================" +} +ana_dmesg() +{ + echo "========= dmesg ===================" + echo "# dmesg | tail" + dmesg | tail + echo "=================================" +} + +ana_performance() +{ + ana_uptime + ana_vmstat + ana_cpu + ana_mem + ana_net + ana_io + ana_pidstat + ana_dmesg +} + + +ocos_analyze() +{ + if [ -n "$1" ]; then + ana_op=$1 + if [ "$ana_op"x == "cpu"x ];then + ana_cpu + elif [ 
"$ana_op"x == "io"x ];then + ana_io + elif [ "$ana_op"x == "mem"x ];then + ana_mem + elif [ "$ana_op"x == "net"x ];then + ana_net + else + ana_performance + fi + else + ana_performance + fi +} +#ocos_analyze $1 diff --git a/sys-manage/ocos-backup.sh b/sys-manage/ocos-backup.sh new file mode 100755 index 0000000000000000000000000000000000000000..24f116cf978b71beae45033aa4975d3ca567fc2a --- /dev/null +++ b/sys-manage/ocos-backup.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Live OpenCloudOS Backup, 2020 +# Songqiao Tao + +new_dir() +{ + local newdir="$*" + i=0 + while [ -e $newdir ]; do + i=`expr $i + 1` + newdir="$*-$i" + done + echo $newdir +} + +rebuildtree() +{ +# Remounting the linux directories effectively excludes removable media, manually mounted devices, windows partitions, virtual files under /proc, /sys, /dev, etc. If your partition scheme is more complicated than listed below, you must add lines to rebuildtree() and destroytree(), otherwise the backup will be partial. + mkdir /$1 + mount --make-private --bind / /$1 + mount --make-private --bind /boot /$1/boot + mount --make-private --bind /boot/efi /$1/boot/efi + mount --make-private --bind /home /$1/home + #mount --make-private --bind /tmp /$1/tmp + #mount --make-private --bind /usr /$1/usr + #mount --make-private --bind /var /$1/var + #mount --make-private --bind /srv /$1/srv + #mount --make-private --bind /opt /$1/opt + mount --make-private --bind /usr/local /$1/usr/local +} + +destroytree() +{ + umount /$1/usr/local + #umount /$1/opt + #umount /$1/srv + #umount /$1/var + #umount /$1/usr + #umount /$1/tmp + umount /$1/home + umount /$1/boot/efi + umount /$1/boot + umount /$1 + rmdir /$1 +} + +ocos_backup() +{ + if [ -n "$1" ]; then + back_op=$1 + if [ "$back_op"x == "reboot"x ];then + /usr/lib/opencloudos-tools/sys-manage/opencloudos_super_tool.py -b + else + echo "Please type the correct parameters!" + echo "ocos -b : backup the system online" + echo "ocos -b reboot : reboot to backup the system" + fi + exit 0 + fi + if [ ! -x /usr/sbin/mksquashfs ]; then + echo "Try to install squashfs-tools rpm by yum!" + yum -y install squashfs-tools > /dev/null 2>&1 + fi + if [ ! -x /usr/sbin/mksquashfs ]; then + echo "/usr/sbin/mksquashfs not found, please install squashfs-tools first!" 
+ exit -1 + fi + + bindingdir=`new_dir /tmp/bind` + backupdir="/data/opencloudos/backup" + bindingdir="${bindingdir#/}" + backupdir="${backupdir#/}" + + exclude=`new_dir /tmp/exclude` + echo $backupdir > $exclude + echo $bindingdir >> $exclude + echo etc/udev/rules.d/70-persistent-net.rules >> $exclude + echo etc/machine-id >> $exclude + echo lost+found >> $exclude + echo data/lost+found >> $exclude + echo usr/local/lost+found >> $exclude + echo var/cache/yum >> $exclude + + for i in `swapon -s | grep file | cut -d " " -f 1`; do + echo "${i#/}" >> $exclude + done + + for i in `ls /tmp -A`; do + echo "tmp/$i" >> $exclude + done + + rebuildtree $bindingdir + + VER=$(awk '/OpenCloudOS/{print $(NF)}' /etc/opencloudos-release) + mkdir -p "/$backupdir" + sqfs=opencloudos-64bit-v${VER}-backup.`date +"%Y-%m-%d-%H:%M"`.sqfs + mksquashfs /$bindingdir "/$backupdir/${sqfs}" -comp xz -b 262144 -ef $exclude + destroytree $bindingdir + rm $exclude + echo "Your backup is ready in /$backupdir/${sqfs}" +} + +#ocos_backup diff --git a/sys-manage/ocos-check-health.sh b/sys-manage/ocos-check-health.sh new file mode 100755 index 0000000000000000000000000000000000000000..dcb9abb3311d134ee991d72f7ecdd134efd3707a --- /dev/null +++ b/sys-manage/ocos-check-health.sh @@ -0,0 +1,296 @@ +#!/bin/bash +# OpenCloudOS Health Check, 2022 +# Minjie HUANG + +echo "##########################################################################" +echo "# #" +echo "# OpenCloudOS Health Check #" +echo "# #" +echo "##########################################################################" +echo "# #" +echo "# WARNING: This check script is based on part 8.1.4 of GB/T 22239-2019, #" +echo "# which will ONLY CHECK the system settings and will NOT MODIFY #" +echo "# any propertities of the system. #" +echo "# The suggestions provided is only for reference, which cannot #" +echo "# ensure the system meet the security technical requirements. #" +echo "# The suggestions may also impact on your business program. #" +echo "# Please modify the settings carefully according to the actual #" +echo "# requirements. 
#" +echo "# #" +echo "##########################################################################" +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>System Basic Info<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" +hostname=$(uname -n) +ocrelease=$(head /etc/opencloudos-release) +kernel=$(uname -r) +platform=$(uname -p) +address=$(ip addr | grep inet | grep -v "inet6" | grep -v "127.0.0.1" | awk '{ print $2; }' | tr '\n' '\t' ) +cpumodel=$(cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq) +cpu=$(cat /proc/cpuinfo | grep 'processor' | sort | uniq | wc -l) +machinemodel=$(dmidecode | grep "Product Name" | sed 's/^[ \t]*//g' | tr '\n' '\t' ) +date=$(date) + +echo -e "Hostname:\t\t$hostname" +echo -e "OpenCloudOS Release:\t$ocrelease" +echo -e "Kernel Version:\t\t$kernel" +echo -e "Platform:\t\t$platform" +echo -e "IP Address:\t\t$address" +echo -e "CPU Model:\t\t$cpumodel" +echo -e "CPU Cores:\t\t$cpu" +echo -e "Machine Model:\t\t$machinemodel" +echo -e "System Time:\t\t$date" +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Resource Usage<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" +summemory=$(free -h |grep "Mem:" | awk '{print $2}') +freememory=$(free -h |grep "Mem:" | awk '{print $4}') +usagememory=$(free -h |grep "Mem:" | awk '{print $3}') +uptime=$(uptime | awk '{print $2" "$3" "$4" "$5}' | sed 's/,$//g') +loadavg=$(uptime | awk '{print $9" "$10" "$11" "$12" "$13}') + +echo -e "Mem Total:\t\t$summemory" +echo -e "Mem Usage:\t\t$usagememory" +echo -e "Mem Free:\t\t$freememory" +echo -e "Uptime:\t\t$uptime" +echo -e "Load:\t\t$loadavg" +echo "==========================================================================" +echo "Mem Status:" +vmstat 2 5 +echo "==========================================================================" +echo "Zombie Process:" +ps -ef | grep zombie | grep -v grep +if [ $? == 1 ];then + echo ">>>No zombie process found." +else + echo ">>>Zombie process found.------[WARN]" +fi +echo "==========================================================================" +echo "Top 5 CPU Usage Processes:" +ps auxf |sort -nr -k 3 |head -5 +echo "==========================================================================" +echo "Top 5 Mem Usage Processes:" +ps auxf |sort -nr -k 4 |head -5 +echo "==========================================================================" +echo "ENV:" +env +echo "==========================================================================" +echo "Route:" +route -n +echo "==========================================================================" +echo "Ports:" +netstat -tunlp +echo "==========================================================================" +echo "Connections:" +netstat -n | awk '/^tcp/ {++S[$NF]} END {for(a in S) print a, S[a]}' +echo "==========================================================================" +echo "Startup Services:" +systemctl list-unit-files | grep enabled +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>System User Info<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" +echo "Active Users:" +w | tail -n +2 +echo "==========================================================================" +echo "All Users:" +cut -d: -f1,2,3,4 /etc/passwd +echo "==========================================================================" +echo "All Groups:" +cut -d: -f1,2,3 /etc/group +echo "==========================================================================" +echo "Crontab:" +crontab -l +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>>>>Identification Security<<<<<<<<<<<<<<<<<<<<<<<<<<" +grep -i "^password.*requisite.*pam_cracklib.so" /etc/pam.d/system-auth > /dev/null +if [ $? 
== 0 ];then + echo ">>>Password Complexity: Set" +else + grep -i "pam_pwquality\.so" /etc/pam.d/system-auth > /dev/null + if [ $? == 0 ];then + echo ">>>Password Complexity: Set" + else + echo ">>>Password Complexity: Not set--------[WARN]" + fi +fi +echo "==========================================================================" +awk -F":" '{if($2!~/^!|^*/){print ">>>("$1")" " is an unlocked user--------[WARN]"}}' /etc/shadow +echo "==========================================================================" +more /etc/login.defs | grep -E "PASS_MAX_DAYS" | grep -v "#" |awk -F' ' '{if($2!=90){print ">>>Password expiration time is "$2" days, please change to 90 days.------[WARN]"}}' +echo "==========================================================================" +grep -i "^auth.*required.*pam_tally2.so.*$" /etc/pam.d/sshd > /dev/null +if [ $? == 0 ];then + echo ">>>Login Failed Policy: On" +else + echo ">>>Login Failed Policy: Off, please add login failed lock.----------[WARN]" +fi +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>>>>Access Control Security<<<<<<<<<<<<<<<<<<<<<<<<<<" +echo "Non preset users:" +more /etc/passwd |awk -F ":" '{if($3>500){print ">>>/etc/passwd User: "$1 " UID: "$3" is not a preset user, please check.--------[WARN]"}}' +echo "==========================================================================" +echo "Privilege users:" +awk -F: '$3==0 {print $1}' /etc/passwd +echo "==========================================================================" +echo "Empty password users:" +awk -F: '($2=="!!") {print $1" has no password, please check.-------[WARN]"}' /etc/shadow +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Security Audit<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" +echo "History of all logged in users in past 30 days:" +last | head -n 30 +echo "==========================================================================" +echo "Check syslog audit service:" +if systemctl status rsyslog | egrep " active \(running";then + echo ">>>syslog service on" +else + echo ">>>syslog service off---------[WARN]" +fi +echo "==========================================================================" +echo "Check syslog forwarding:" +if more /etc/rsyslog.conf | egrep "@...\.|@..\.|@.\.|\*.\* @...\.|\*\.\* @..\.|\*\.\* @.\.";then + echo ">>>syslog forwarding on" +else + echo ">>>syslog forwarding off---------[WARN]" +fi +echo "==========================================================================" +echo "Audit featrues and logs:" +more /etc/rsyslog.conf | grep -v "^[$|#]" | grep -v "^$" +echo "==========================================================================" +echo "Key files modified time:" +ls -ltr /bin/ls /bin/login /etc/passwd /bin/ps /etc/shadow|awk '{print ">>>Filename: "$9" ""Last modified time: "$6" "$7" "$8}' + +echo "==========================================================================" +echo "Check key log files:" +log_secure=/var/log/secure +log_messages=/var/log/messages +log_cron=/var/log/cron +log_boot=/var/log/boot.log +log_dmesg=/var/log/dmesg +if [ -e "$log_secure" ]; then + echo ">>>/var/log/secure exists" +else + echo ">>>/var/log/secure not exists------[WARN]" +fi +if [ -e "$log_messages" ]; then + echo ">>>/var/log/messages exists" +else + echo ">>>/var/log/messages not exists------[WARN]" +fi +if [ -e "$log_cron" ]; then + echo ">>>/var/log/cron exists" +else + echo ">>>/var/log/cron not exists--------[WARN]" +fi +if [ -e "$log_boot" ]; then + echo ">>>/var/log/boot.log exists" +else + echo ">>>/var/log/boot.log not exists--------[WARN]" +fi +if [ -e 
"$log_dmesg" ]; then + echo ">>>/var/log/dmesg exists" +else + echo ">>>/var/log/dmesg not exists--------[WARN]" +fi +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>Residual Information Protection<<<<<<<<<<<<<<<<<<<<<<" +echo "Disk Partitions:" +echo "Please check the utilization rates---------[INFO]" +df -h +echo "==========================================================================" +echo "Block devices:" +lsblk +echo "==========================================================================" +echo "Filesystem info:" +more /etc/fstab | grep -v "^#" | grep -v "^$" +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>Intrusion Proetction Security<<<<<<<<<<<<<<<<<<<<<<<" +echo "System instrustion log:" +more /var/log/secure |grep refused +if [ $? == 0 ];then + echo ">>>Instrustion found--------[WARN]" +else + echo ">>>Instrustion not found" +fi +echo "==========================================================================" +echo "Login failed history:" +lastb | head | grep -v "^$" | grep -v "btmp" > /dev/null +if [ $? == 1 ];then + echo ">>>No login failed history." +else + echo ">>>Login failed history found.--------[WARN]" + lastb | head | grep -v "^$" | grep -v "btmp" +fi +echo "==========================================================================" +echo "SSH Login Failed Info:" +more /var/log/secure | grep "Failed" > /dev/null +if [ $? == 1 ];then + echo ">>>No SSH login failed info found." +else + more /var/log/secure|awk '/Failed/{print $(NF-3)}'|sort|uniq -c|awk '{print ">>>Failed IP: "$2"="$1"times---------[WARN]";}' +fi +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>>>Malicious Code Protection<<<<<<<<<<<<<<<<<<<<<<<<<" +echo "Check Anti-Virus Software:" +crontab -l | grep clamscan.sh > /dev/null +if [ $? == 0 ];then + echo ">>>ClamAV installed." + crontab -l | grep freshclam.sh > /dev/null + if [ $? == 0 ];then + echo ">>>Virus database update crontab set." + fi +else + echo ">>>ClamAV not installed, please install anti-virus software.--------[INFO]" +fi +echo " " +echo ">>>>>>>>>>>>>>>>>>>>>>>>>Resource Control Security<<<<<<<<<<<<<<<<<<<<<<<<<" +echo "Check xinetd service:" +if ps -elf |grep xinet |grep -v "grep xinet";then + echo ">>>xinetd service on, please check whether can turn it off--------[INFO]" +else + echo ">>>xinetd service off" +fi +echo "==========================================================================" +echo "Check SSH service:" +if systemctl status sshd | grep -E "listening on|active \(running\)"; then + echo ">>>SSH service on." +else + echo ">>>SSH service off--------[WARN]" +fi +echo "==========================================================================" +echo "Check Telnet-Server service:" +if more /etc/xinetd.d/telnetd 2>&1|grep -E "disable=no"; then + echo ">>>Telnet-Server service on" +else + echo ">>>Telnet-Server service off--------[INFO]" +fi +echo "==========================================================================" +ps axu | grep iptables | grep -v grep || ps axu | grep firewalld | grep -v grep +if [ $? == 0 ];then + echo ">>>Firewall on" +iptables -nvL --line-numbers +else + echo ">>>Firewall off--------[WARN]" +fi +echo "==========================================================================" +echo "Check SSH policy (hosts.deny):" +more /etc/hosts.deny | grep -E "sshd" +if [ $? == 0 ]; then + echo ">>>hosts.deny set" +else + echo ">>>hosts.deny not set--------[WARN]" +fi +echo "==========================================================================" +echo "Check SSH policy (hosts.allow):" +more /etc/hosts.allow | grep -E "sshd" +if [ $? 
== 0 ]; then + echo ">>>hosts.allow set" +else + echo ">>>hosts.allow not set--------[WARN]" +fi +echo "==========================================================================" +echo "Use hosts.allow by default when hosts.allow conflicts with host.deny." +echo "==========================================================================" +grep -i "TMOUT" /etc/profile /etc/bashrc +if [ $? == 0 ];then + echo ">>>Login timeout set." +else + echo ">>>Login timeout not set. Please add TMOUT=600 in /etc/profile or /etc/bashrc--------[WARN]" +fi +echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>end<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" \ No newline at end of file diff --git a/sys-manage/ocos-lib.sh b/sys-manage/ocos-lib.sh new file mode 100755 index 0000000000000000000000000000000000000000..d7a7bbc477ec75b1217e5a24de02fc3801dc952f --- /dev/null +++ b/sys-manage/ocos-lib.sh @@ -0,0 +1,386 @@ +#!/bin/sh +# +# ocos common variables and functions +# author: g_CAPD_SRDC_OS@tencent.com +# + +. /usr/lib/opencloudos-tools/sys-manage/ocos-backup.sh +. /usr/lib/opencloudos-tools/sys-manage/ocos-analyze-performance.sh +OC_RELEASE="/etc/opencloudos-release" +KERNEL_VER=$(uname -r) +OC_VER_V="" +OC_VER="" +OC_DATE="" + + + +#get tlinux version and date +getVersionDate() +{ + if [ -f /etc/motd ]; then + OC_VER=$(awk '/OpenCloudOS/{print $NF}' /etc/opencloudos-release) + fi + if [ -f $OC_RELEASE ]; then + OC_VER_V=$(head $OC_RELEASE) + fi + +} + +#check machine type, dependent on virt-what +vmwhat() +{ + if [ ! -x /usr/sbin/virt-what ]; then + echo "Try to install virt-what rpm by yum!" + yum -y install virt-what > /dev/null 2>&1 + fi + if [ -x /usr/sbin/virt-what ]; then + echo -e -n "Machine type:\t\t" + v_what=$(/usr/sbin/virt-what) + [ -z "$v_what" ] && echo "Physical machine" && return 0 + echo $v_what | grep -q -i "virtualbox" && echo "Virtualbox guest" && return 0 + echo $v_what | grep -q -i "vmware" && echo "VMware guest" && return 0 + echo "$v_what" + return 0 + fi +} + +#show opencloudos system information +ocos_show() +{ + echo "=============== System Information ===============" + # get SN + if [ -x /usr/sbin/dmidecode ]; then + seri_num=$(dmidecode -s system-serial-number | tail -n1) + [ -n "$seri_num" ] && echo -e "Serial Number:\t\t$seri_num" + fi + + # get IP + eth1_ip=$(ip a | grep "inet" | grep "eth1" | awk '{print $2}' | awk -F/ '{print $1}') + if [ -n "$eth1_ip" ]; then + echo -e "eth1 IP:\t\t$eth1_ip" + else + other_ip=$(ip a|grep "inet"|grep -v "inet6"|grep -v "127.0"|head -n1|awk '{print $2}'|awk -F/ '{print $1}') + [ -n "$other_ip" ] && echo -e "IP:\t\t$other_ip" + fi + + # get machine type + if which rpm &> /dev/null ;then + vmwhat + fi + + # get system version + [ -n "$KERNEL_VER" ] && echo -e "Kernel version:\t\t$KERNEL_VER" + + getVersionDate + if [ -n "$OC_VER_V" ]; then + echo -e "OpenCloudOS release:\t$OC_VER_V" + elif [ -f /etc/os-release ]; then + echo -e -n "OS release:\t\t" + awk -F'"' '/PRETTY_NAME/ {print $2}' /etc/os-release + fi + + if [ -n "$OC_VER" ]; then + echo -e "Release version:\t$OC_VER" + fi + if [ -n "$OC_DATE" ]; then + echo -e "Release date:\t\t$OC_DATE" + fi + + # get rpm version + if which rpm &> /dev/null ;then + GCC_VERSION=$(rpm -q gcc | grep -v "not" | head -n1) + [ -n "$GCC_VERSION" ] && echo -e "Gcc version:\t\t$GCC_VERSION" + GLIBC_VERSION=$(rpm -q glibc | grep -v "not" | head -n1) + [ -n "$GLIBC_VERSION" ] && echo -e "Glibc version:\t\t$GLIBC_VERSION" + SYSTEMD_VERSION=$(rpm -q systemd | grep -v "not" | head -n1) + [ -n "$SYSTEMD_VERSION" ] && echo -e 
"Systemd version:\t$SYSTEMD_VERSION" + PYTHON_VERSION=$(rpm -q python | grep -v "not" | head -n1) + [ -n "$PYTHON_VERSION" ] && echo -e "Python version:\t\t$PYTHON_VERSION" + fi +} + + +#opencloudos check rpms +ocos_check() +{ + if [ -n "$1" ]; then + rpm -qa | grep $1 > /tmp/rpms_list.txt + else + echo "It may take few minitues!" + rpm -qa > /tmp/rpms_list.txt + + + fi + for i in $(cat /tmp/rpms_list.txt) + do + result=$(rpm -q -V $i) + if [ -n "$result" ]; then + echo "$i:" + echo $result + fi + done +} + + +#opencloudos update rpms +ocos_update() +{ + if [ -n "$1" ]; then + yum update $@ + else + yum update + fi +} + +#opencloudos install rpms +ocos_install() +{ + if [ -n "$1" ]; then + yum install $@ + else + echo "You Nedd to pass a list of pkgs to install" + fi +} + + +# Yum Check Available Package Updates +ocos_check_update() +{ + #To see which installed packages on your system have updates available, use the following command + yum check-update +} + +# Recover or Reinstall the system +ocos_recover() +{ + /usr/lib/opencloudos-tools/sys-manage/opencloudos_super_tool.py -r $@ +} + +# System Health Check +ocos_check_health() +{ + health_op=$1 + health_output_path=$2 + if [ "$health_op"x == "-o"x ]; then + if [ -d "$health_output_path" ]; then + /usr/lib/opencloudos-tools/sys-manage/ocos-check-health.sh | tee ${health_output_path}/health_check_`date +%Y%m%d_%H%M%S`.txt + echo "Your health check report has been saved at ${health_output_path}." + else + echo "Output path does not exists." + mkdir -p /data/opencloudos/health-check + /usr/lib/opencloudos-tools/sys-manage/ocos-check-health.sh | tee /data/opencloudos/health-check/health_check_`date +%Y%m%d_%H%M%S`.txt + echo "Your health check report has been saved at /data/opencloudos/health-check." + fi + else + /usr/lib/opencloudos-tools/sys-manage/ocos-check-health.sh + fi +} +function strstr() +{ + echo $1 | grep $2 +} + +function record_function_call() +{ + $TOP/sys-manage/record_call.py $TOP/call_stat.txt $1 +} + +ops() +{ + local TOP=/usr/lib/opencloudos-tools RUN=ops-run DSC=ops-help sub=ops help= bad= + + if [ ! 
-f "$TOP/call_stat.txt" ]; then + touch $TOP/call_stat.txt + fi + if [ "$1" = "--bashcomp" ]; then + shift + help=comp + #bashcomp mode, ignore debug + [ "$1" = "-d" ] && shift + #no completion if -v + [ "$1" = "-v" ] && exit 0 + else + if [ "$1" = "-d" ]; then + shift + set -x + fi + + if [ "$1" = "-v" ]; then + read help < $TOP/ops/VERSION + echo oc-ops v$help + exit 0 + fi + fi + + sub_num=$# + sub_show=$# + work_one=$1 + while [ $# -gt 0 ]; do + if [ -x "$TOP/$sub/$RUN" ]; then + break + elif [ "$1" = "-h" -o "$1" = "help" ]; then + [ "$help" = "comp" ] && exit 0 + help=help + break + elif [[ "$1" =~ ^[^a-zA-Z0-9] || "$1" =~ [^-_.a-zA-Z0-9] ]]; then + bad=1 + break + elif [ -d "$TOP/$sub/$1" ]; then + sub="$sub/$1" + shift + else + bad=1 + #bad cmd found, no bashcomp available + [ -n "$bashcomp" ] && exit 0 + break + fi + done + + if [ -z "$bad" -a -x "$TOP/$sub/$RUN" ]; then + #leaf cmd found, no bashcomp available + if [ "$help" != "comp" ]; then + #run cmd with remain args + record_function_call $sub + exec "$TOP/$sub/$RUN" "$@" + exit 255 + fi + # do leaf cmd completion + sub="$sub/ocos-comp" + while [ $# -gt 0 ]; do + sub="$sub.$1" + shift + done + [ -r "$TOP/$sub.txt" ] && cat "$TOP/$sub.txt" + exit 0 + fi + + if [ -z "$help" ]; then + # print error message if not bashcomp + local ops="${sub//\// }" + [ $# -gt 0 ] && echo ERROR: ocos $ops: command \"$1\" invalid + echo Available command for '"'ocos $ops'"': + fi + + cd "$TOP/$sub" || exit 1 + + local msg run op n=0 w=0 + declare -a ops msgs files + + if [ "$help" = "comp" ]; then + # bashcomp only search one level commands + files=( * ) + else + # -h/help print all sub commands + IFS=$'\n' files=( $(find . -type d -name "[a-zA-Z0-9]*" -print) ) + fi + + for op in "${files[@]}"; do + # strip leadding ./ + op="${op#./}" + #filter out invalid characters + [[ "$op" =~ (^|/)[^a-zA-Z0-9] ]] && continue + [[ "$op" =~ [^-_.a-zA-Z0-9/] ]] && continue + #need non-empty, readable $DSC file + [ -r "$op/$DSC" ] || continue + #find command need show all dir, other command show its sub only + if [ $sub_show -lt 1 ] || [[ -z $(strstr $work_one "find") && -z $(strstr $work_one "show_comm") ]]; then + tmp=$op + while [ $sub_num -ge 1 ]; do + sub_num=$(($sub_num - 1)) + tmp="${op#/}" + done + if [ ! -z $(strstr $tmp "/") ]; then + continue + fi + fi + read msg < "$op/$DSC" + [ -z "$msg" ] && continue + + if [ $sub_show -eq 1 ] && [ ! -z $(strstr $work_one "show_comm") ]; then + if [ ! -f "$op/$RUN" ]; then + continue + fi + fi + + if [ "$help" = "comp" ]; then + # bashcomp don't like tab + #ATTN: tab after double //, convert tab to space + msg=${msg// / } + else + # convert / to space + op="${op//\// }" + fi + + #save and update max-width + [ ${#op} -gt $w ] && w=${#op} + eval 'ops['$n']="$op"' + eval 'msgs['$n']="$msg"' + n=$((n+1)) + done + + #show results + w="%-${w}s %s\n" + op=0 + #support find + if [ $sub_show -ge 1 ] && [ ! -z $(strstr $work_one "find") ]; then + if [ $sub_show -eq 1 ]; then + printf "\033[32menter anything you want, such as: oc-ops find file/fs/net .etc\033[0m\n" + printf "\033[32menter anything you want, such as: oc-ops find 文件/文件相关/网络相关/等\033[0m\n" + elif [ $sub_show -eq 2 ] && [ ! -z $(strstr $1 "find") ]; then + while [ $op -lt $n ]; do + if [ ! -z $(strstr "${ops[$op]}" $2) ] || [ ! -z $(strstr "${msgs[$op]}" $2) ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + fi + op=$((op+1)) + done + fi + elif [ $sub_show -ge 1 ] && [ ! 
-z $(strstr $work_one "show_comm") ]; then + i=0 + printf "\033[35m-------show HCI command-------\033[0m\n" + while [ $op -lt $n ]; do + if [ ! -z $(strstr ${ops[$op]} "interaction") ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + unset ops[$op] + unset msgs[$op] + i=$((i+1)) + fi + op=$((op+1)) + done + printf "\033[35m-------show paramter command-------\033[0m\n" + op=0 + while [ $op -lt $n ]; do + if [ ! -z "${ops[$op]}" ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + fi + op=$((op+1)) + done + #show sub level( < level 1) command + elif [ $sub_show -ge 1 ]; then + while [ $op -lt $n ]; do + printf "$w" "${ops[$op]}" "${msgs[$op]}" + op=$((op+1)) + done + #show level 1 command in order + else + i=0 + while [ $op -lt $n ]; do + if [ ! -z $(strstr ${ops[$op]} "interaction") ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + unset ops[$op] + unset msgs[$op] + i=$((i+1)) + fi + op=$((op+1)) + done + op=0 + n=$(($n - $i + 1)) + while [ $op -lt $n ]; do + if [ ! -z "${ops[$op]}" ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + fi + op=$((op+1)) + done + fi + + exit 0 +} diff --git a/sys-manage/opencloudos_super_tool.py b/sys-manage/opencloudos_super_tool.py new file mode 100755 index 0000000000000000000000000000000000000000..d28f94d438a6ec064ad42f092e4d7ae8868dd120 --- /dev/null +++ b/sys-manage/opencloudos_super_tool.py @@ -0,0 +1,333 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# ==================================== +# Created By : Songqiao Tao +# Email : joeytao@tencent.com +# Created Date: Fri Mar 24 2017 +# Update Date : Thu Jun 2 2022 +# Description : Backup and Recover system for OpenCloudOS +# Version : 4.1 +# ==================================== + +import shutil +import os +os.environ["PATH"] = "/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin" +import sys +import getopt +import time +import subprocess +import re +import socket +import glob + +def usage(): + print('''Usage: ocos -b | -r [ -f ] [ -p password ] [-s script] -i SQFS_FILE | -h + + Options: + -b Backup OpenCloudOS system + -r Recover OpenCloudOS system + -i SQFS_FILE The path to the sqfs file + -f Format data, e.g. /dev/sda4 in physical machine, /dev/vdb1 in VM, /data in VM without data disk + -p PASSWORD Set the new password + -s SCRIPT An executable file, e.g. a script with shebang to run within the installed new system + -u RPM An rpm file, e.g. kernel-xxx.rpm to update within the installed new system + -d disk specific installation disk. e.g. sda, default use the root filesystem disk. + -U use UUID + -h Print this short help text and exit +''') + +def exit_script(msg=''): + if msg: + print(msg) + print("") + usage() + sys.exit(3) + +def check_img(img): + if not img: + exit_script("-i option is not given") + if not os.path.isfile(img): + exit_script('%s does not exist or is not a regular file.' 
% img) + if not img.endswith('.sqfs'): + exit_script('Support .sqfs file only.') + +def get_disk(folder): + prop = subprocess.Popen("df %s | tail -1 | awk '{print $1}'" % folder, shell=True, stdout=subprocess.PIPE) + dev = prop.stdout.read().strip().decode('utf-8') + return dev + +def get_uuid(disk): + prop = subprocess.Popen("blkid -s UUID -o value %s" % disk, shell=True, stdout=subprocess.PIPE) + uuid = prop.stdout.read().strip().decode('utf-8') + if uuid == "": + print("Can't get %s UUID, some error may happened, check it" % (disk)) + sys.exit(1) + return 'UUID=' + uuid + +def check_data_disk(): + f = open("/etc/mtab", "r") + for line in f: + dev = line.split()[0] + mnt = line.split()[1] + if dev == "/dev/sda4" or dev == "/dev/vdb1": + f.close() + return mnt, dev + f.close() + + if os.path.exists("/data"): + dev = get_disk("/data") + return "/data", dev + else: + try: + os.mkdir("/data") + dev = get_disk("/data") + return "/data", dev + except: + exit_script("/data create failed.") + + +def is_opencloudos(): + if os.path.exists("/etc/opencloudos-release"): + return True + return False + +def is_vm(): + if os.path.isfile("/proc/cmdline"): + f = open("/proc/cmdline") + for line in f: + if "xvda" in line: + f.close() + return True + f.close() + + p = subprocess.Popen(["dmidecode"], stdout=subprocess.PIPE) + vm_desktop_pt = re.compile(r"Manufacturer: innotek GmbH|Vendor: Parallels|Manufacturer: VMware") + vm_cloud_pt = re.compile(r"Product Name: KVM|Product Name: CVM|Manufacturer: QEMU") + vm_desktop_flag = False + vm_cloud_flag = False + for line in p.stdout: + line = str(line) + m1 = vm_desktop_pt.search(line) + m2 = vm_cloud_pt.search(line) + if m1: + vm_desktop_flag = True + if m2: + vm_cloud_flag = True + if vm_desktop_flag == False and vm_cloud_flag == True: + return True + + return False + +def is_rootsize_max(): + if os.path.isfile("/proc/cmdline"): + f = open("/proc/cmdline") + for line in f: + if "rootsize=max" in line: + f.close() + return True + f.close() + return False + +def check_os(): + if not (is_opencloudos()): + exit_script("only OpenCloudOS is supported") + if os.uname()[-1] != 'x86_64': + exit_script("only x86_64 is supported") + +def get_iface_hwaddr_map(): + tmp_map = {} + for path in glob.glob("/sys/class/net/eth*"): + iface = os.path.basename(path) + hwaddr = open(os.path.join(path, "address")).read().strip() + tmp_map[iface] = hwaddr + if "eth0" not in tmp_map and "eth1" not in tmp_map: + exit_script("machine must have eth0 or eth1") + return tmp_map + +def copy_files(option, img, sqfs_dir, script, rpm_file): + shutil.copy("/usr/lib/opencloudos-tools/initrd-backup-recovery.img", "/boot/initrd-2.0-backup-recovery.img") + shutil.copy("/usr/lib/opencloudos-tools/vmlinuz-backup-recovery", "/boot/vmlinuz-2.0-backup-recovery") + if option == "recovery": + img_basename = os.path.basename(img) + if not os.path.isfile(sqfs_dir + '/' + img_basename): + shutil.copy(img, sqfs_dir) + if script: + shutil.copy(script, os.path.join(sqfs_dir, "hardinstall_script")) + if rpm_file: + shutil.copy(rpm_file, os.path.join(sqfs_dir, "hardinstall_extra.rpm")) + iface_hwaddr_map = get_iface_hwaddr_map() + f = open(os.path.join(sqfs_dir, "hardinstall_iface_hwaddr"), "w") + for key in iface_hwaddr_map: + f.write("%s=%s\n" % (key, iface_hwaddr_map[key])) + f.close() + +def grub_add_entry(option, img, format_data, passwd, vm, data_disk, install_disk): + if format_data: + format_value = 1 + else: + format_value = 0 + if passwd: + pw_value = passwd + else: + pw_value = 0 + + grub_conf = 
'/boot/grub2/grub.cfg' + + if vm: + console = "tty0 console=ttyS0,115200" + else: + console = "tty0" + img_name = '' + if option == "recovery": + basename = os.path.basename(img) + img_name = basename.replace(".sqfs", "") + title1_recovery = ''' +title OpenCloudOS (Recovery) + root (hd0,0) + kernel /boot/vmlinuz-2.0-backup-recovery quiet elevator=noop i8042.noaux console=%s panic=5 osname=%s installmethod=harddisk recovery-mode format_data=%d passwd=%s + initrd /boot/initrd-2.0-backup-recovery.img +''' % (console, img_name, format_value, pw_value) + + title1_backup = ''' +title OpenCloudOS (Backup) + root (hd0,0) + kernel /boot/vmlinuz-2.0-backup-recovery quiet elevator=noop i8042.noaux console=%s panic=5 backup-mode + initrd /boot/initrd-2.0-backup-recovery.img +''' % (console) + + print('backup the grub_conf') + shutil.copy(grub_conf, grub_conf + ".bak") + + if is_opencloudos(): + print('add new boot entry!') + if os.system('grubby --add-kernel=/boot/vmlinuz-2.0-backup-recovery --title="OpenCloudOS backup and revovery" --initrd=/boot/initrd-2.0-backup-recovery.img --copy-default'): + exit_script("setup grub failed !") + if option == 'backup': + os.system('sed -i "/^options/ s/$/ panic=5 backup-mode/" /boot/loader/entries/*backup-recovery.conf') + else: + os.system('sed -i "/^options/ s/$/ osname=%s installmethod=harddisk installdisk=%s datadisk=%s panic=5 recovery-mode format_data=%d passwd=%s/" \ + /boot/loader/entries/*backup-recovery.conf ' % (img_name, re.escape(install_disk), re.escape(data_disk), format_value, pw_value )) + os.system('grub2-reboot "OpenCloudOS backup and revovery"') + +def get_sshd_ip(): + p = subprocess.Popen(["netstat", "-ntpl"], stdout=subprocess.PIPE) + netstat_pt = re.compile(r"(\d+\.\d+\.\d+.\d+):.+\/sshd") + for line in p.stdout: + line = str(line) + m = netstat_pt.search(line) + if m: + ssh_ip = m.group(1) + p.stdout.close() + p.wait() + return ssh_ip + p.wait() + exit_script("sshd not running or not bound to a specified ipv4 address") + +def check_memory_size_M(size): + MEM_PT = re.compile(r"Mem:\s+(\d+)") + p = subprocess.Popen(["free", "-m"], stdout=subprocess.PIPE) + for line in p.stdout: + line = str(line) + m = MEM_PT.search(line) + if m: + p.stdout.close() + p.wait() + total_size = int(m.group(1)) + if total_size < size: + exit_script("%d MiB total memory is required" % size) + return + exit_script("Fail to get total memory") + +def main(): + try: + opts, args = getopt.getopt(sys.argv[1:], 'brfi:p:s:u:d:Unh', ["help"]) + except getopt.GetoptError as err: + print(str(err)) + usage() + sys.exit(1) + + img = '' + option = '' + passwd = '' + format_data = False + script = '' + rpm_file = '' + use_uuid = False + install_disk = '' + data_disk = '' + for o, a in opts: + if o in ('-h', '--help'): + usage() + sys.exit(0) + if o == '-b': + option = 'backup' + if o == '-r': + option = 'recovery' + if o == '-f': + format_data = True + if o == '-i': + img = os.path.abspath(a) + if o == '-p': + passwd = a + if o == '-d': + install_disk = a + if '/' in install_disk: + print("wrong argument, just input disk name(e.g.: sda) without any slash") + sys.exit(1) + if o == '-U': + use_uuid = True + if o == '-s': + script = os.path.abspath(a) + if not os.path.isfile(script): + exit_script("wrong -s option, %s does not exist or is not a regular file" % script) + if o == '-u': + rpm_file = os.path.abspath(a) + if not os.path.isfile(rpm_file): + exit_script("wrong -u option, %s does not exist or is not a regular file" % rpm_file) + if o == '-n': + print("Backup OpenCloudOS 
system and no need to reboot") + if os.system('/usr/lib/opencloudos-tools/sys-manage/ocos-backup.sh'): + exit_script("setup grub failed !") + else: + print("Backup OpenCloudOS system successfully!") + sys.exit(0) + if option == '': + print("-b or -r parameter is required!") + usage() + sys.exit(1) + if option == 'recovery': + check_img(img) + + if install_disk == '': + install_disk = get_disk('/').split('/')[-1].strip() + prop = subprocess.Popen('basename "$(readlink -f "/sys/class/block/%s/..")"' % install_disk, shell=True, stdout=subprocess.PIPE) + install_disk = prop.stdout.read().strip().decode('utf-8') + vm = is_vm() + # data_disk is a partition, not a device name + data_dir, data_disk = check_data_disk() + data_disk = data_disk.split('/')[-1] + check_memory_size_M(3000) + check_os() + + if use_uuid: + if data_disk != '': + data_disk = get_uuid("/dev/" + data_disk) + if 'nvme' in install_disk: + install_disk += 'p1' + else: + install_disk += '1' + # we got UUID of one partition of the install disk to find the target device. + install_disk = get_uuid("/dev/" + install_disk) + + copy_files(option, img, data_dir, script, rpm_file) + grub_add_entry(option, img, format_data, passwd, vm, data_disk, install_disk) + + if option == 'backup': + print("\n!! Please reboot to backup the system as soon as possible !!") + os.system('sync') + elif option == 'recovery': + print("\n!! Please reboot to recover or reinstall the system as soon as possible !!") + os.system('sync') + +if __name__ == "__main__": + main() diff --git a/sys-manage/record_call.py b/sys-manage/record_call.py new file mode 100755 index 0000000000000000000000000000000000000000..f43e8050c37c87b493e7018f9a72f6910f8635c1 --- /dev/null +++ b/sys-manage/record_call.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +import sys +import os +import re +import time + +counter = 0 +start = time.perf_counter() +total_length = 50 + + +def record_file(file, sub): + fin = open(file, "rt") + alllines = fin.readlines() + fin.close() + fout = open(file, "w+") + record_total = 0 + record_sub = 0 + for line in alllines: + #record total call numbers + idx = line.find("total number:"); + if idx != -1: + number = line[idx + 13:] + new_number = str(int(number) + 1) + line = "total number: " + new_number + "\n" + record_total = 1 + fout.writelines(line) + continue + idx = line.find(sub); + if idx != -1: + idx = line.find("number: "); + if idx != -1: + number = line[idx + 7:] + new_number = str(int(number) + 1) + line = sub + " number: " + new_number + "\n" + record_sub = 1 + fout.writelines(line) + continue + fout.writelines(line) + + fout.close() + fin = open(file, "rt") + alllines = fin.readlines() + fin.close() + fappend = open(file, 'a') + if record_total == 0: + fappend.writelines("total number: 1\n") + if record_sub == 0: + line = sub + " number: 1\n" + fappend.write(line) + + fappend.close() + fin = open(file, "rt") + alllines = fin.readlines() + fin.close() + +if __name__ == "__main__": + if len(sys.argv) > 2: + file = sys.argv[1] + sub = sys.argv[2] + record_file(file, sub) diff --git a/sys-manage/tos-lib.sh b/sys-manage/tos-lib.sh new file mode 100755 index 0000000000000000000000000000000000000000..9af6805093beeab7ffe89a176eadc26356bf91f3 --- /dev/null +++ b/sys-manage/tos-lib.sh @@ -0,0 +1,589 @@ +#!/bin/sh +# +# Tos common variables and functions +# author: g_CAPD_SRDC_OS@tencent.com +# + +. /usr/lib/tencentos-tools/sys-manage/tos-fix-dns.sh +. /usr/lib/tencentos-tools/sys-manage/tos-backup.sh +. 
/usr/lib/tencentos-tools/sys-manage/tos-analyze-performance.sh +TOS_RELEASE="/etc/tlinux-release" +KERNEL_VER=$(uname -r) +TOS_VER_V="" +TOS_VER="" +TOS_DATE="" + +# check tlinux2 +is_tlinux2() +{ + [ ! -f $TOS_RELEASE ] && return 1 + grep -q "Tencent tlinux release 2." $TOS_RELEASE || \ + grep -q "Tencent Linux release 2." $TOS_RELEASE +} + +#check tlinux +is_tlinux() +{ + t_v=$1 + [ ! -f $TOS_RELEASE ] && return 1 + grep -q "Tencent tlinux release $t_v" $TOS_RELEASE || \ + grep -q "Tencent Linux release $t_v" $TOS_RELEASE +} + +#get tlinux version and date +getVersionDate() +{ + if [ -f /etc/motd ]; then + TOS_VER=$(awk '/Version/{print $2}' /etc/motd) + TOS_DATE=$(awk '/Version/{print $NF}' /etc/motd) + fi + if [ -f $TOS_RELEASE ]; then + TOS_VER_V=$(head $TOS_RELEASE) + fi + +} + +#check machine type, dependent on virt-what +vmwhat() +{ + if [ ! -x /usr/sbin/virt-what ]; then + echo "Try to install virt-what rpm by yum!" + yum -y install virt-what > /dev/null 2>&1 + fi + if [ -x /usr/sbin/virt-what ]; then + echo -e -n "Machine type:\t\t" + v_what=$(/usr/sbin/virt-what) + [ -z "$v_what" ] && echo "Physical machine" && return 0 + echo $v_what | grep -q -i "virtualbox" && echo "Virtualbox guest" && return 0 + echo $v_what | grep -q -i "vmware" && echo "VMware guest" && return 0 + echo "$v_what" + return 0 + fi +} + +#show tencentos system information +tos_show() +{ + echo "=============== System Information ===============" + # get SN + if [ -x /usr/sbin/dmidecode ]; then + seri_num=$(dmidecode -s system-serial-number | tail -n1) + [ -n "$seri_num" ] && echo -e "Serial Number:\t\t$seri_num" + fi + + # get IP + eth1_ip=$(ip a | grep "inet" | grep "eth1" | awk '{print $2}' | awk -F/ '{print $1}') + if [ -n "$eth1_ip" ]; then + echo -e "eth1 IP:\t\t$eth1_ip" + else + other_ip=$(ip a|grep "inet"|grep -v "inet6"|grep -v "127.0"|head -n1|awk '{print $2}'|awk -F/ '{print $1}') + [ -n "$other_ip" ] && echo -e "IP:\t\t$other_ip" + fi + + # get machine type + if which rpm &> /dev/null ;then + vmwhat + fi + + # get system version + [ -n "$KERNEL_VER" ] && echo -e "Kernel version:\t\t$KERNEL_VER" + + getVersionDate + if [ -n "$TOS_VER_V" ]; then + echo -e "TencentOS release:\t$TOS_VER_V" + elif [ -f /etc/os-release ]; then + echo -e -n "OS release:\t\t" + awk -F'"' '/PRETTY_NAME/ {print $2}' /etc/os-release + fi + + if [ -n "$TOS_VER" ]; then + echo -e "Release version:\t$TOS_VER" + fi + if [ -n "$TOS_DATE" ]; then + echo -e "Release date:\t\t$TOS_DATE" + fi + + # get rpm version + if which rpm &> /dev/null ;then + GCC_VERSION=$(rpm -q gcc | grep -v "not" | head -n1) + [ -n "$GCC_VERSION" ] && echo -e "Gcc version:\t\t$GCC_VERSION" + GLIBC_VERSION=$(rpm -q glibc | grep -v "not" | head -n1) + [ -n "$GLIBC_VERSION" ] && echo -e "Glibc version:\t\t$GLIBC_VERSION" + SYSTEMD_VERSION=$(rpm -q systemd | grep -v "not" | head -n1) + [ -n "$SYSTEMD_VERSION" ] && echo -e "Systemd version:\t$SYSTEMD_VERSION" + PYTHON_VERSION=$(rpm -q python | grep -v "not" | head -n1) + [ -n "$PYTHON_VERSION" ] && echo -e "Python version:\t\t$PYTHON_VERSION" + fi +} + + +#tencentos check rpms +tos_check() +{ + if [ -n "$1" ]; then + rpm -qa | grep $1 > /tmp/rpms_list.txt + else + echo "It may take few minitues!" 
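The check loop above prints raw "rpm -q -V" output, which is terse; for reference, the per-file flag string decodes as follows (the package name is just an example):

```bash
# Verify a single package; no output means nothing has been modified.
rpm -V bash
# A typical line of output looks like:
#   S.5....T.    /usr/share/doc/bash/README
# S = size differs, 5 = digest (contents) differs, T = mtime differs,
# M = mode differs, U/G = owner/group differs, L = symlink target differs.
```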
+ rpm -qa > /tmp/rpms_list.txt + + + fi + for i in $(cat /tmp/rpms_list.txt) + do + result=$(rpm -q -V $i) + if [ -n "$result" ]; then + echo "$i:" + echo $result + fi + done +} + + +#tencentos update rpms +tos_update() +{ + if [ -n "$1" ]; then + yum update $@ + else + yum update + fi +} + +#tencentos install rpms +tos_install() +{ + if [ -n "$1" ]; then + yum install $@ + else + echo "You Nedd to pass a list of pkgs to install" + fi +} + +#tencentos fix yum problems, update tlinux-release rpm +tos_fix_yum() +{ + tos_fix_dns + if [ $? -eq 0 ]; then + # dig mirrors.tencent.com + dig_result=$(/usr/bin/dig mirrors.tencent.com) + answer_section=$(echo $dig_result | grep "ANSWER SECTION:") + # if we have answer section + if [ "$answer_section" == "" ]; then + echo "The DNS can't resolve the domain of mirrors.tencent.com" + echo "Please configure DNS according to this page: http://mirrors.tencent.com/#/document/question " + return 1; + fi + fi + if is_tlinux 1.2 ; then + #tlinux1.2-kvm_guest + uname -r | grep -q "kvm_guest" + if [ $? -eq 0 ]; then + rpm -Uvh https://mirrors.tencent.com/tlinux/rpm/tlinux-release-kvm-guest-1.0-2.tl1.noarch.rpm + [ $? -ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/rpm/tlinux-release-kvm-guest-1.0-2.tl1.noarch.rpm + fi + + result=$(rpm -q -V python) + if [ -n "$result" ]; then + echo "$result" + echo "python rpm is changed." + read -r -p "Would you like to reinstall the python rpm?[y/n]" input + case $input in + [yY][eE][sS]|[yY]) + echo "Reinstall python rpm" + rpm -ivh --force https://mirrors.tencent.com/tlinux/1.2/os/x86_64/Packages/python-2.6.6-29.el6.x86_64.rpm + ;; + [nN][oO]|[nN]) + echo "You choose no, exit" + exit 0 + ;; + *) + echo "Invalid input..." + exit 1 + ;; + esac + fi + + rpm -Uvh https://mirrors.tencent.com/tlinux/rpm/epel-release-6-12.tl1.noarch.rpm + [ $? -ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/rpm/epel-release-6-12.tl1.noarch.rpm + rpm -Uvh https://mirrors.tencent.com/tlinux/rpm/tlinux-release-1-11.tl1.x86_64.rpm + [ $? -ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/rpm/tlinux-release-1-11.tl1.x86_64.rpm + return $? + fi + if is_tlinux 2.0 ; then + result=$(rpm -q -V python) + if [ -n "$result" ]; then + echo "$result" + echo "python rpm is changed." + read -r -p "Would you like to reinstall the python rpm?[y/n]" input + case $input in + [yY][eE][sS]|[yY]) + echo "Reinstall python rpm" + python_rpm=$(rpm -q python | grep x86_64) + rpm -ivh --force https://mirrors.tencent.com/tlinux/2.0/os/x86_64/Packages/${python_rpm}.rpm + rpm -ivh --force https://mirrors.tencent.com/tlinux/2.0/tlinux/x86_64/RPMS/${python_rpm}.rpm + ;; + [nN][oO]|[nN]) + echo "You choose no, exit" + exit 0 + ;; + *) + echo "Invalid input..." + exit 1 + ;; + esac + fi + + rpm -Uvh https://mirrors.tencent.com/tlinux/rpm/tlinux-release-2-4.tl2.x86_64.rpm + [ $? -ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/rpm/tlinux-release-2-4.tl2.x86_64.rpm + return $? + fi + if is_tlinux 2.2 ; then + #tlinux2.2-kvm_guest + uname -r | grep -q "kvm_guest" + if [ $? -eq 0 ]; then + rpm -Uvh https://mirrors.tencent.com/tlinux/rpm/tlinux-release-kvm-guest-1.0-2.tl2.noarch.rpm + [ $? -ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/rpm/tlinux-release-kvm-guest-1.0-2.tl2.noarch.rpm + fi + + #tlinux2.2-tkernel3 + uname -r | grep -q "tlinux3" + if [ $? -eq 0 ]; then + rpm -Uvh https://mirrors.tencent.com/tlinux/rpm/tlinux-tkernel3-release-1.1-1.tl2.noarch.rpm + [ $? 
-ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/rpm/tlinux-tkernel3-release-1.1-1.tl2.noarch.rpm + fi + + result=$(rpm -q -V python) + if [ -n "$result" ]; then + echo "$result" + echo "python rpm is changed." + read -r -p "Would you like to reinstall the python rpm?[y/n]" input + case $input in + [yY][eE][sS]|[yY]) + echo "Reinstall python rpm" + python_rpm=$(rpm -q python | grep x86_64) + rpm -ivh --force https://mirrors.tencent.com/tlinux/2.2/os/x86_64/Packages/${python_rpm}.rpm + rpm -ivh --force https://mirrors.tencent.com/tlinux/2.2/tlinux/x86_64/RPMS/${python_rpm}.rpm + ;; + [nN][oO]|[nN]) + echo "You choose no, exit" + exit 0 + ;; + *) + echo "Invalid input..." + exit 1 + ;; + esac + fi + + + rpm -Uvh https://mirrors.tencent.com/tlinux/rpm/tlinux-release-2-11.tl2.x86_64.rpm + [ $? -ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/rpm/tlinux-release-2-11.tl2.x86_64.rpm + return $? + fi + + if is_tlinux 2.6 ; then + result=$(rpm -q -V python) + if [ -n "$result" ]; then + echo "$result" + echo "python rpm is changed." + read -r -p "Would you like to reinstall the python rpm?[y/n]" input + case $input in + [yY][eE][sS]|[yY]) + echo "Reinstall python rpm" + python_rpm=$(rpm -q python | grep x86_64) + rpm -ivh --force https://mirrors.tencent.com/tlinux/2.6/os/x86_64/Packages/${python_rpm}.rpm + rpm -ivh --force https://mirrors.tencent.com/tlinux/2.6/tlinux/x86_64/RPMS/${python_rpm}.rpm + ;; + [nN][oO]|[nN]) + echo "You choose no, exit" + exit 0 + ;; + *) + echo "Invalid input..." + exit 1 + ;; + esac + fi + + rpm -Uvh https://mirrors.tencent.com/tlinux/2.6/tlinux/x86_64/RPMS/epel-release-7-13.tl2.noarch.rpm + [ $? -ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/2.6/tlinux/x86_64/RPMS/epel-release-7-13.tl2.noarch.rpm + rpm -Uvh https://mirrors.tencent.com/tlinux/2.6/tlinux/x86_64/RPMS/tlinux-release-2-11.tl2.1.x86_64.rpm + [ $? -ne 0 ] && rpm -ivh --force https://mirrors.tencent.com/tlinux/2.6/tlinux/x86_64/RPMS/tlinux-release-2-11.tl2.1.x86_64.rpm + return $? + fi +} + +# tos fix yum, dns, etc. +tos_fix() +{ + fix_op=$1 + if [ "$fix_op"x == "yum"x ];then + tos_fix_yum + elif [ "$fix_op"x == "dns"x ];then + tos_fix_dns + else + echo "tos fix $fix_op: invalid option" + fi +} + +# tos set dns, etc. +tos_set() +{ + set_op=$1 + if [ "$set_op"x == "dns"x ];then + tos_set_dns + elif [ "$set_op"x == "irq"x ];then + if [ ! -x /etc/init.d/irqaffinity ]; then + yum -y install tlinux-irqaffinity + fi + /etc/init.d/irqaffinity restart + else + echo "tos set $set_op: invalid option" + fi +} + +# Yum Check Available Package Updates +tos_check_update() +{ + #To see which installed packages on your system have updates available, use the following command + yum check-update +} + +# Recover or Reinstall the system +tos_recover() +{ + /usr/lib/tencentos-tools/sys-manage/tencentos_super_tool.py -r $@ +} + +# System Health Check +tos_check_health() +{ + health_op=$1 + health_output_path=$2 + if [ "$health_op"x == "-o"x ]; then + if [ -d "$health_output_path" ]; then + /usr/lib/tencentos-tools/sys-manage/tos-check-health.sh | tee ${health_output_path}/health_check_`date +%Y%m%d_%H%M%S`.txt + echo "Your health check report has been saved at ${health_output_path}." + else + echo "Output path does not exist." + mkdir -p /data/tencentos/health-check + /usr/lib/tencentos-tools/sys-manage/tos-check-health.sh | tee /data/tencentos/health-check/health_check_`date +%Y%m%d_%H%M%S`.txt + echo "Your health check report has been saved at /data/tencentos/health-check."
+ fi + else + /usr/lib/tencentos-tools/sys-manage/tos-check-health.sh + fi +} +function strstr() +{ + echo $1 | grep $2 +} + +function record_function_call() +{ + $TOP/sys-manage/record_call.py $TOP/call_stat.txt $1 +} +ops() +{ + local TOP=/usr/lib/tencentos-tools RUN=ops-run DSC=ops-help sub=ops help= bad= + + if [ ! -f "$TOP/call_stat.txt" ]; then + touch $TOP/call_stat.txt + fi + if [ "$1" = "--bashcomp" ]; then + shift + help=comp + #bashcomp mode, ignore debug + [ "$1" = "-d" ] && shift + #no completion if -v + [ "$1" = "-v" ] && exit 0 + else + if [ "$1" = "-d" ]; then + shift + set -x + fi + + if [ "$1" = "-v" ]; then + read help < $TOP/ops/VERSION + echo t-ops v$help + exit 0 + fi + fi + sub_num=$# + sub_show=$# + work_one=$1 + while [ $# -gt 0 ]; do + if [ -x "$TOP/$sub/$RUN" ]; then + break + elif [ "$1" = "-h" -o "$1" = "help" ]; then + [ "$help" = "comp" ] && exit 0 + help=help + break + elif [[ "$1" =~ ^[^a-zA-Z0-9] || "$1" =~ [^-_.a-zA-Z0-9] ]]; then + bad=1 + break + elif [ -d "$TOP/$sub/$1" ]; then + sub="$sub/$1" + shift + else + bad=1 + #bad cmd found, no bashcomp available + [ -n "$bashcomp" ] && exit 0 + break + fi + done + + if [ -z "$bad" -a -x "$TOP/$sub/$RUN" ]; then + #leaf cmd found, no bashcomp available + if [ "$help" != "comp" ]; then + #run cmd with remaining args + record_function_call $sub + exec "$TOP/$sub/$RUN" "$@" + exit 255 + fi + # do leaf cmd completion + sub="$sub/tos-comp" + while [ $# -gt 0 ]; do + sub="$sub.$1" + shift + done + [ -r "$TOP/$sub.txt" ] && cat "$TOP/$sub.txt" + exit 0 + fi + + if [ -z "$help" ]; then + # print error message if not bashcomp + local ops="${sub//\// }" + [ $# -gt 0 ] && echo ERROR: tos $ops: command \"$1\" invalid + echo -e "\033[35mTo find a sub command, enter: \"t-ops find \$anything\", such as: t-ops find file/文件\033[0m" + echo -e "\033[35mAvailable commands for \"tos $ops\": \033[0m" + fi + + cd "$TOP/$sub" || exit 1 + + local msg run op n=0 w=0 + declare -a ops msgs files + + if [ "$help" = "comp" ]; then + # bashcomp only searches one level of commands + files=( * ) + else + # -h/help print all sub commands + IFS=$'\n' files=( $(find . -type d -name "[a-zA-Z0-9]*" -print) ) + fi + + var=$1 + for op in "${files[@]}"; do + # strip leading ./ + op="${op#./}" + #filter out invalid characters + [[ "$op" =~ (^|/)[^a-zA-Z0-9] ]] && continue + [[ "$op" =~ [^-_.a-zA-Z0-9/] ]] && continue + #need non-empty, readable $DSC file + [ -r "$op/$DSC" ] || continue + #the find command needs to show all dirs, other commands show only their own subs + if [ $sub_show -lt 1 ] || [[ -z $(strstr $work_one "find") && -z $(strstr $work_one "show_comm") ]]; then + tmp=$op + while [ $sub_num -ge 1 ]; do + sub_num=$(($sub_num - 1)) + tmp="${op#/}" + done + if [ ! -z $(strstr $tmp "/") ]; then + continue + fi + fi + read msg < "$op/$DSC" + [ -z "$msg" ] && continue + + if [ $sub_show -eq 1 ] && [ ! -z $(strstr $work_one "show_comm") ]; then + if [ ! -f "$op/$RUN" ]; then + continue + fi + fi + + if [ "$help" = "comp" ]; then + # bashcomp doesn't like tabs + #ATTN: tab after double //, convert tab to space + msg=${msg// / } + else + # convert / to space + op="${op//\// }" + fi + + if [ ! -z "$var" ] && [ -z $(strstr $op $var) ]; then + continue + fi + + #save and update max-width + [ ${#op} -gt $w ] && w=${#op} + eval 'ops['$n']="$op"' + eval 'msgs['$n']="$msg"' + n=$((n+1)) + done + + #show results + w="%-${w}s %s\n" + op=0 + #support find + if [ $sub_show -ge 1 ] && [ !
-z $(strstr $work_one "find") ]; then + if [ $sub_show -eq 1 ]; then + printf "\033[32menter anything you want, such as: t-ops find file/fs/net, etc.\033[0m\n" + printf "\033[32menter anything you want, such as: t-ops find 文件/文件相关/网络相关/等\033[0m\n" + elif [ $sub_show -eq 2 ] && [ ! -z $(strstr $1 "find") ]; then + while [ $op -lt $n ]; do + if [ ! -z $(strstr "${ops[$op]}" $2) ] || [ ! -z $(strstr "${msgs[$op]}" $2) ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + fi + op=$((op+1)) + done + fi + elif [ $sub_show -ge 1 ] && [ ! -z $(strstr $work_one "show_comm") ]; then + i=0 + printf "\033[35m-------show HCI command-------\033[0m\n" + while [ $op -lt $n ]; do + if [ ! -z $(strstr ${ops[$op]} "interaction") ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + unset ops[$op] + unset msgs[$op] + i=$((i+1)) + fi + op=$((op+1)) + done + printf "\033[35m-------show parameter command-------\033[0m\n" + op=0 + while [ $op -lt $n ]; do + if [ ! -z "${ops[$op]}" ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + fi + op=$((op+1)) + done + #show sub-level (< level 1) commands + elif [ $sub_show -ge 1 ]; then + while [ $op -lt $n ]; do + printf "$w" "${ops[$op]}" "${msgs[$op]}" + op=$((op+1)) + done + #show level 1 commands in order + else + i=0 + while [ $op -lt $n ]; do + if [ ! -z $(strstr ${ops[$op]} "interaction") ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + unset ops[$op] + unset msgs[$op] + i=$((i+1)) + fi + op=$((op+1)) + done + op=0 + n=$(($n - $i + 1)) + while [ $op -lt $n ]; do + if [ ! -z "${ops[$op]}" ]; then + printf "$w" "${ops[$op]}" "${msgs[$op]}" + fi + op=$((op+1)) + done + fi + + exit 0 +} diff --git a/sys-manage/treediff.pl b/sys-manage/treediff.pl new file mode 100755 index 0000000000000000000000000000000000000000..7787c37297cba689d0b0cadeb5ac78c0b895b15e --- /dev/null +++ b/sys-manage/treediff.pl @@ -0,0 +1,479 @@ +#!/usr/bin/perl +# +# Compare 2 distribution trees +# v1.1 +# by samuelliao +# +# 11-18-2011 v1.0 initial release +# 11-18-2011 v1.1 fix typo in same_id(): id0->id1 +# + +###################################### +# GLOBALS +###################################### + +# difference counter +$diffs = 0; + +# brief mode: only print differing pathnames, suppress contents +$brief = 0; + + +###################################### +# EXCLUDED PATH MANAGEMENT +###################################### + +# left and right topdir string length +@prefixlength; + +# a hash of excluded path +%excludedhash; + +# add pattern to excluded path hash +sub add_excluded_pattern($) { + my $fn = $_[0]; + if($fn =~ m{^\.[^/]*$}) { + $excludedhash{$fn} = 1; + } elsif($fn =~ m{^/}) { + $excludedhash{$fn} = 1; + } else { + syntax("Invalid excluded pattern, must be /XXX, /XXX/ or .XXX"); + } +} + +# load excluded pattern from file +sub load_excluded_list($) { + my $fn = $_[0]; + + open FILE, "<$fn" or do { + print "ERROR: cannot open file $fn\n"; + return undef; + }; + + while(<FILE>) { + chomp; + add_excluded_pattern($_); + } + close FILE; +} + +sub init_prefix($$) { + @prefixlength = (length($_[0]), length($_[1])); +} + +# check excluded list, second parameter is 0(left)/1(right) path name +sub excluded($$) { + my $fn = $_[0]; + return 1 if $excludedhash{substr($fn, $prefixlength[$_[1]])}; + $fn =~ s{.*\.}{.}; + return 1 if $excludedhash{$fn}; + return 0; +} + +# check excluded subdir, always check left tree +sub excludeddir($) { + return $excludedhash{substr($_[0], $prefixlength[0]).'/'}; +} + + +################################################ +# Ownership translation 
+################################################ +%idhash = (); + +sub load_mapping_file($$) { + my $fn = $_[0]; + my $type = $_[1]; + + open FILE, "<$fn" or return undef; + + while(<FILE>) { + my @item = split /:/; + $idhash{$type.$item[2]} = $item[0]; + } + close FILE; +} + +sub load_ownership_mapping($$$) { + my $file0 = $_[0]; + my $file1 = $_[1]; + my $etcdir = $_[2]; + + load_mapping_file("$file0/$etcdir/passwd", "u0"); + load_mapping_file("$file0/$etcdir/group", "g0"); + load_mapping_file("$file1/$etcdir/passwd", "u1"); + load_mapping_file("$file1/$etcdir/group", "g1"); +} + +sub same_id($$$) { + my $id0 = $_[0]; + my $id1 = $_[1]; + my $t0 = $_[2]; + my $n0 = $idhash{$t0."0".$id0}; + my $n1 = $idhash{$t0."1".$id1}; + return $n0 eq $n1 if defined($n0) and defined($n1); + return 1 if $id0 == $id1; + return 0; +} + +sub id2name($$$) { + my $id = $_[0]; + my $t0 = $_[1]; + my $t1 = $_[2]; + my $name = $idhash{$t0.$t1.$id}; + $id .= "($name)" if defined($name) and $name ne ''; + return $id; +} + +################################################ +# File Summary display +################################################ + +# constants from stat(), lstat() results: +# 0 dev device number of filesystem +# 1 ino inode number +# 2 mode file mode (type and permissions) +# 3 nlink number of (hard) links to the file +# 4 uid numeric user ID of file's owner +# 5 gid numeric group ID of file's owner +# 6 rdev the device identifier (special files only) +# 7 size total size of file, in bytes +# 8 atime last access time in seconds since the epoch +# 9 mtime last modify time in seconds since the epoch +# 10 ctime inode change time in seconds since the epoch (*) +# 11 blksize preferred block size for file system I/O +# 12 blocks actual number of blocks allocated +sub MODE(){ 2 } +sub UID() { 4 } +sub GID() { 5 } +sub RDEV() { 6 } +sub SIZE() { 7 } +# constants for slots that replace unneeded stat results +sub TYPE() { 0 } +sub PATH() { 1 } +sub TREE() { 3 } +sub SYMLINK() { 6 } #SYMLINK and RDEV are mutually exclusive +sub COMPARE() { 8 } +sub SUMMARY() { 9 } + +sub summary_with_size($$) { + my $info = $_[0]; + + my $head = $$info[TREE] ? '+' : '-'; + my $file = $$info[PATH]; + my $type = $$info[TYPE]; + my $mode = sprintf "%o", $$info[MODE]; + my $uid = id2name($$info[UID], "u", $$info[TREE]); + my $gid = id2name($$info[GID], "g", $$info[TREE]); + my $size = $$info[SIZE]; + print "$head$type mode=$mode uid=$uid gid=$gid size=$size $file\n"; +} + +sub summary_with_symlink($$) { + my $info = $_[0]; + + my $head = $$info[TREE] ? '+' : '-'; + my $file = $$info[PATH]; + my $type = $$info[TYPE]; + my $mode = sprintf "%o", $$info[MODE]; + my $uid = id2name($$info[UID], "u", $$info[TREE]); + my $gid = id2name($$info[GID], "g", $$info[TREE]); + my $symlink = $$info[SYMLINK]; + print "$head$type uid=$uid gid=$gid $file --> $symlink\n"; +} + +sub summary_with_device($$) { + my $info = $_[0]; + + my $head = $$info[TREE] ? '+' : '-'; + my $file = $$info[PATH]; + my $type = $$info[TYPE]; + my $mode = sprintf "%o", $$info[MODE]; + my $uid = id2name($$info[UID], "u", $$info[TREE]); + my $gid = id2name($$info[GID], "g", $$info[TREE]); + my $devno = sprintf "%d:%d", $$info[RDEV]>>8, $$info[RDEV]&255; + print "$head$type mode=$mode uid=$uid gid=$gid devno=$devno $file\n"; +} + +sub summary_base($$) { + my $info = $_[0]; + + my $head = $$info[TREE] ?
'+' : '-'; + my $file = $$info[PATH]; + my $type = $$info[TYPE]; + my $mode = sprintf "%o", $$info[MODE]; + my $uid = id2name($$info[UID], "u", $$info[TREE]); + my $gid = id2name($$info[GID], "g", $$info[TREE]); + printf "$head$type mode=$mode uid=$uid gid=$gid $file\n"; +} + +################################################ +# File Compare Routines +################################################ + +# print differ reason and summary info +sub printdiff($$$) { + my $msg = $_[0]; + my $info0 = $_[1]; + my $info1 = $_[2]; + ++$diffs; + print "$msg $$info0[PATH] and $$info1[PATH] differ\n"; + unless($brief) { + my $func; + $func = $$info0[SUMMARY]; &$func($info0); + $func = $$info1[SUMMARY]; &$func($info1); + } +} + +# scan file list in a directory +sub getfilelist($) { + my $dirname = shift; + my @filelist; + if(opendir DIR, $dirname) { + my @tmp = readdir DIR; + for my $f(@tmp) { + next if $f eq '.'; + next if $f eq '..'; + push @filelist, $f; + } + closedir DIR; + } + return sort {$a cmp $b} @filelist; +} + +#nothing to compare +sub compare_none($$) { + return 0; +} + +#compare two symlink target +sub compare_symlink($$) { + my $info0 = $_[0]; + my $info1 = $_[1]; + + printdiff("Symlinks", $info0, $info1) if $$info0[SYMLINK] ne $$info1[SYMLINK]; +} + +#compare device's devno +sub compare_device($$) { + my $info0 = $_[0]; + my $info1 = $_[1]; + + printdiff("Devices", $info0, $info1) if $$info0[RDEV] ne $$info1[RDEV]; +} + +#bridge to external diff compare regular files +sub compare_file($$) { + my $info0 = $_[0]; + my $info1 = $_[1]; + + my @cmdline = ( "diff", "-u", $$info0[PATH], $$info1[PATH]); + $cmdline[1] = "-uq" if $brief; + # use fork+exec+wait instead system, because system ignore SIGINT + if(fork()==0) { + exec @cmdline; + exit(254); + } + wait; + ++$diffs if $?; +} + +#report only in left/right tree, arguments: $dir $fn $pos +sub onlyin($$$) { + my $dir = $_[0]; + my $fn = $_[1]; + my $pos = $_[2]; + + unless(excluded("$dir/$fn", $pos)) { + ++$diffs; + print "Only in $dir/$fn\n"; + unless($brief) { + my $info = fileinfo("$dir/$fn", $pos); + my $func = $$info[SUMMARY]; + &$func($info); + } + } +} + +#compare directory tree +sub compare_subdir($$) { + my $info0 = $_[0]; + my $info1 = $_[1]; + + my $dir0 = $$info0[PATH]; + return 1 if excludeddir($dir0); + my $dir1 = $$info1[PATH]; + + my @list0 = getfilelist($dir0); + my @list1 = getfilelist($dir1); + + for my $f (@list0) { + onlyin($dir1, shift @list1, 1) while $#list1 >= 0 && $f gt $list1[0]; + if($#list1 < 0 || $f lt $list1[0]) { + onlyin($dir0, $f, 0); + } else { + &compare_all("$dir0/$f", "$dir1/$f") unless excluded("$dir0/$f", 0); + shift @list1; + } + } + onlyin($dir1, $_, 1) foreach @list1; +} + +# return file information object +sub fileinfo($$) { + my $fn = $_[0]; + my @info = lstat($fn); + + if($#info < 0) { + $info[TREE] = $_[1]; + $info[PATH] = $fn; + $info[TYPE] = "NOTFOUND"; + $info[COMPARE] = \&compare_none; + $info[SUMMARY] = \&compare_none; + return \@info; + } + + $info[TREE] = $_[1]; + $info[PATH] = $fn; + $info[MODE] = $info[MODE] & 07777; + $info[TYPE] = 'UNKNOWN'; + $info[COMPARE] = \&compare_none; + $info[SUMMARY] = \&summary_with_size; + + if(-l $fn) { + $info[TYPE] = 'SYMLINK'; + $info[COMPARE] = \&compare_symlink; + $info[SUMMARY] = \&summary_with_symlink; + $info[SYMLINK] = readlink $fn; + } elsif(-d $fn) { + $info[TYPE] = 'DIR'; + $info[COMPARE] = \&compare_subdir; + $info[SUMMARY] = \&summary_with_size; + } elsif(-b $fn) { + $info[TYPE] = 'BLOCKDEV'; + $info[COMPARE] = \&compare_device; + 
$info[SUMMARY] = \&summary_with_device; + } elsif(-c $fn) { + $info[TYPE] = 'CHARDEV'; + $info[COMPARE] = \&compare_device; + $info[SUMMARY] = \&summary_with_device; + } elsif(-S $fn) { + $info[TYPE] = 'SOCKET'; + $info[SUMMARY] = \&summary_base; + } elsif(-p $fn) { + $info[TYPE] = 'PIPE'; + $info[SUMMARY] = \&summary_base; + } elsif(-f $fn) { + $info[TYPE] = 'FILE'; + $info[COMPARE] = \&compare_file; + $info[SUMMARY] = \&summary_with_size; + } + return \@info; +} + +#generic routine, compare all file types +sub compare_all($$) { + my $info0 = fileinfo($_[0], 0); + my $info1 = fileinfo($_[1], 1); + + if($$info0[TYPE] ne $$info1[TYPE]) { + printdiff("Filetype", $info0, $info1); + return 1; + } + + if(same_id($$info0[UID], $$info1[UID], 'u') == 0) { + printdiff("Ownership", $info0, $info1); + } elsif(same_id($$info0[GID], $$info1[GID], 'g') == 0) { + printdiff("Ownership", $info0, $info1); + } elsif($$info0[MODE] ne $$info1[MODE]) { + printdiff("Permissions", $info0, $info1); + } + + my $func = $$info0[COMPARE]; + return &$func($info0, $info1); +} + +################################################ +# Syntax +################################################ + +sub syntax(;$) { + my $msg = $_[0]; + print "Syntax Error: $msg\n\n" if $msg ne ''; + print "Usage: treediff.pl [-q] [-c subdir] [-x pattern] [-X file] path0 path1\n"; + print " -q only print names of differing files\n"; + print " -c subdir pathname of the /etc dir inside each tree, used to look up passwd/group files\n"; + print " -x pattern exclude pattern, e.g.:\n"; + print " /tmp exclude /tmp and subdir contents\n"; + print " /tmp/ exclude subdir contents, but still compare /tmp itself\n"; + print " /etc/mtab exclude file /etc/mtab\n"; + print " .tmp exclude all *.tmp files\n"; + print " -X file read excluded patterns from file\n"; + print "\n"; + print "Return value:\n"; + print " 0: same\n"; + print " 1: some difference\n"; + print " 254: Interrupted\n"; + print " 255: Syntax error\n"; + + exit(255); +} + +################################################ +# Main routines +################################################ + +syntax() if $#ARGV == -1; + +#parse command line +$file0 = undef; +$file1 = undef; +$etcdir = "/etc"; +while(defined($arg = shift @ARGV)) { + if(substr($arg, 0, 1) eq '-') { + if($arg eq '-q') { + $brief = 1; + } elsif($arg eq '-c') { + syntax("option -c requires a pathname") if $#ARGV < 0; + $etcdir = shift @ARGV; + } elsif($arg eq '-x') { + syntax("option -x requires a pattern") if $#ARGV < 0; + add_excluded_pattern(shift @ARGV); + } elsif($arg eq '-X') { + syntax("option -X requires a list file") if $#ARGV < 0; + load_excluded_list(shift @ARGV); + } else { + syntax("unknown option $arg"); + } + } elsif(defined($file1)) { + syntax("too many arguments"); + } elsif(defined($file0)) { + syntax("$arg does not exist") unless -e $arg; + $file1 = $arg; + } else { + syntax("$arg does not exist") unless -e $arg; + $file0 = $arg; + } +} + +syntax("missing left tree argument") unless defined($file0); +syntax("missing right tree argument") unless defined($file1); + +sub intr() { + print "\n<<>>\n"; + exit(254); +} +$SIG{'HUP'} = 'IGNORE'; +$SIG{'CHLD'} = 'DEFAULT'; +$SIG{'QUIT'} = \&intr; +$SIG{'INT'} = \&intr; +$SIG{'TERM'} = \&intr; +# for excluded pathname checking +init_prefix($file0, $file1); +# for ownership checking +load_ownership_mapping($file0, $file1, $etcdir); +# do the compare +compare_all($file0, $file1); + +# zero is same, one has difference +exit($diffs?1:0);
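+# +# Example usage (a sketch based only on the options documented in syntax() above; +# the tree paths and exclude patterns below are illustrative, not shipped defaults): +#   ./treediff.pl -q -x /tmp/ -x .swp /mnt/rootfs-old /mnt/rootfs-new +# compares the two trees, prints only the names of entries that differ, skips the +# contents of /tmp and every *.swp file, and exits 0 (same), 1 (differences found), +# 254 (interrupted) or 255 (syntax error).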